"""Attach a role and a full name to every interrogation record.

Each document in the ``interrogations`` collection carries a ``person``
string.  That string is matched against the hand-curated ``data`` list
below; on a match the document receives the person's ``role`` and
``full_name``, otherwise both are set to ``None``.  The person string is
echoed to the terminal in green (matched) or red (unmatched) so misses can
be reviewed by eye.
"""
import difflib

from _arango import arango

# ---------------------------------------------------------------------------
# Provenance: the disabled one-off snippet below appears to be what produced
# the raw name/role listing from the PDF before it was cleaned up by hand.
# It is intentionally left commented out.
# ---------------------------------------------------------------------------
# filename = "Huvudprotokoll.pdf"
# doc = fitz.open(f"pdfs/{filename}")

# def group_words_by_y(words, tolerance=2):
#     # Sort the words by their y-coordinate
#     words.sort(key=lambda word: word[1])
#     # Group the words by their rounded y-coordinate
#     grouped_words = itertools.groupby(words, key=lambda word: round(word[1] / tolerance))
#     # Sort the words in each group by their x-coordinate and combine the text
#     combined_words = [' '.join(word[4] for word in sorted(group, key=lambda word: word[0])) for _, group in grouped_words]
#     return combined_words

# append = False
# for page in doc.pages(1,3):
#     words = []
#     text_words = page.get_text('words', sort=True)
#     for word in text_words:
#         if append:
#             words.append(word)
#         if word[4] == "Brottsplatsadress":
#             append = True
#     combined_words = group_words_by_y(words, tolerance=5)
#     for word in combined_words:
#         last_space_index = word.rfind(' ')
#         if last_space_index != -1:
#             first_part = word[:last_space_index]
#             if ',' in first_part:
#                 word_parts = first_part.split(',')
#                 first_part = word_parts[1].strip() + ' ' + word_parts[0].strip()
#             second_part = word[last_space_index+1:]
#         else:
#             first_part = word
#             second_part = ''
#         print(first_part.strip(), ';', second_part.strip())

# Take the output and clean it up in Excel

# NOTE(review): a few entries look like OCR artefacts ("lngemund",
# "Thorbjöm", "BemdtPatrik").  They are left untouched because matching and
# the stored ``full_name`` depend on the exact strings -- confirm against the
# source PDF before correcting them.
data = [
    {"name": "Carl-William Ahlqvist", "role": "Misstänkt"},
    {"name": "Elias David Ahlqvist", "role": "Vittne"},
    {"name": "Marlene Linnea Ahlqvist", "role": "Misstänkt"},
    {"name": "Jhonny Kaj lngemund Backman", "role": "Vittne"},
    {"name": "Louise Solveig Karin Bengtsson", "role": "Vittne"},
    {"name": "Ove Robert Greger Bengtsson", "role": "Misstänkt"},
    {"name": "Björn Willy Johnny Borell", "role": "Vittne"},
    {"name": "Lars Victor Bystedt", "role": "Vittne"},
    {"name": "Svea Helena Caroline Enberg", "role": "Vittne"},
    {"name": "Agnes Marie Hällgren", "role": "Vittne"},
    {"name": "Anna Jessica Maria Höglund", "role": "Vittne"},
    {"name": "Kent Åke Höglund", "role": "Vittne"},
    {"name": "Dan Anton Tobias Johansson", "role": "Vittne"},
    {"name": "Fredrik Max Johansson", "role": "Vittne"},
    {"name": "Ivar Emanuel Johansson", "role": "Målsägande"},
    {"name": "Rut Marit Beatrice Johansson", "role": "Målsägande"},
    {"name": "Lars Anders Markus Karlsson", "role": "Vittne"},
    {"name": "Eija Inkeri Kjäll", "role": "Vittne"},
    {"name": "Neo Arvid Magnus Larsson", "role": "Vittne"},
    {"name": "Lena Marie Susann Lind", "role": "Vittne"},
    {"name": "Elin Linnea Maria Lindell", "role": "Vittne"},
    {"name": "Sofi Teresia Lindwall", "role": "Vittne"},
    {"name": "Lars Thorbjöm Lundgren", "role": "Vittne"},
    {"name": "Fredrik Lars Lundmark", "role": "Vittne"},
    {"name": "Lars-Erik Mikael Molin", "role": "Vittne"},
    {"name": "Per Lars-Erik Molin", "role": "Vittne"},
    {"name": "Robin Alex Nieminen", "role": "Misstänkt"},
    {"name": "Malin Charlotta Nyström", "role": "Vittne"},
    {"name": "Ola Folke Magnus Pålsson", "role": "Vittne"},
    {"name": "Anna Margareta Renlund", "role": "Vittne"},
    {"name": "Karl Emanuel Renström", "role": "Vittne"},
    {"name": "Karl Henrik Sjölund", "role": "Vittne"},
    {"name": "Sven Bertil Stenberg", "role": "Vittne"},
    {"name": "BemdtPatrik Svahn", "role": "Vittne"},
    {"name": "Nea Christina Vänstedt", "role": "Vittne"},
    {"name": "Ola Nils Vänstedt", "role": "Vittne"},
    {"name": "Ulf Peder Öhman", "role": "Vittne"},
]

# Lookup tables derived from ``data``: full name -> role, and the ordered
# list of candidate names handed to difflib.
persons = {entry['name']: entry['role'] for entry in data}
list_of_names = [entry['name'] for entry in data]

# ANSI escape sequences used to colour the terminal report.
_RED = "\033[91m"
_GREEN = "\033[92m"
_RESET = "\033[0m"


def _find_full_name(person, candidates):
    """Return the candidate full name that *person* abbreviates, or ``None``.

    The (at most) two closest fuzzy matches from
    ``difflib.get_close_matches`` are tried in order of similarity.  A
    candidate is accepted only when every whitespace-separated token of
    *person* also occurs as a token of the candidate, so a merely similar
    spelling of a different person is rejected.

    Args:
        person: The name as stored on the interrogation record.
        candidates: Full names to match against.

    Returns:
        The first accepted candidate, or ``None`` when none qualifies.
    """
    close_matches = difflib.get_close_matches(person, candidates, n=2)
    # Tokenise the record's name once rather than once per candidate.
    person_tokens = set(person.split())
    for candidate in close_matches:
        if person_tokens.issubset(candidate.split()):
            return candidate
    return None


# Resolve the collection handle once and reuse it for the read and for every
# write instead of looking it up again on each iteration.
_collection = arango.db.collection('interrogations')
interrogations = _collection.all()

for doc in interrogations:
    full_name = _find_full_name(doc['person'], list_of_names)
    matched = full_name is not None

    doc['role'] = persons[full_name] if matched else None
    print((_GREEN if matched else _RED) + doc['person'] + _RESET)

    # NOTE(review): the original indentation was lost; ``full_name`` is
    # assumed to be assigned for every record (None when unmatched), in line
    # with ``role`` being cleared explicitly above.
    doc['full_name'] = full_name

    # Presumably ``arango.db`` is a python-arango database, in which case
    # ``keep_none=False`` removes None-valued fields from the stored document
    # rather than writing nulls -- confirm against ``_arango``.
    _collection.update(doc, keep_none=False)