# mala/extract_roles.py

import difflib
from _arango import arango
# filename = "Huvudprotokoll.pdf"

# doc = fitz.open(f"pdfs/{filename}")

# def group_words_by_y(words, tolerance=2):
#     # Sort the words by their y-coordinate
#     words.sort(key=lambda word: word[1])

#     # Group the words by their rounded y-coordinate
#     grouped_words = itertools.groupby(words, key=lambda word: round(word[1] / tolerance))

#     # Sort the words in each group by their x-coordinate and combine the text
#     combined_words = [' '.join(word[4] for word in sorted(group, key=lambda word: word[0])) for _, group in grouped_words]

#     return combined_words

# append = False
# for page in doc.pages(1,3):
#     words = []
#     text_words = page.get_text('words', sort=True)
#     for word in text_words:
#         if append:
#             words.append(word)
#         if word[4] == "Brottsplatsadress":
#             append = True
#     combined_words = group_words_by_y(words, tolerance=5)
#     for word in combined_words:
#         last_space_index = word.rfind(' ')
#         if last_space_index != -1:
#             first_part = word[:last_space_index]
#             if ',' in first_part:
#                 word_parts = first_part.split(',')
#                 first_part = word_parts[1].strip() + ' ' + word_parts[0].strip()
#             second_part = word[last_space_index+1:]
#         else:
#             first_part = word
#             second_part = ''
#         print(first_part.strip(), ';', second_part.strip())

# Take the output and clean it up in Excel


# Roster of name/role pairs used further down to annotate interrogation
# documents. Each entry has a full name under "name" and a role label under
# "role". The role labels are Swedish: "Misstänkt" (suspect), "Vittne"
# (witness) and "Målsägande" (injured party).
# NOTE(review): a few names look like OCR misreads (e.g. "lngemund",
# "Thorbjöm", "BemdtPatrik"). They are left untouched here because the
# matching below compares against these exact strings — confirm against the
# stored documents before correcting them.
data = [
    {"name": "Carl-William Ahlqvist", "role": "Misstänkt"},
    {"name": "Elias David Ahlqvist", "role": "Vittne"},
    {"name": "Marlene Linnea Ahlqvist", "role": "Misstänkt"},
    {"name": "Jhonny Kaj lngemund Backman", "role": "Vittne"},
    {"name": "Louise Solveig Karin Bengtsson", "role": "Vittne"},
    {"name": "Ove Robert Greger Bengtsson", "role": "Misstänkt"},
    {"name": "Björn Willy Johnny Borell", "role": "Vittne"},
    {"name": "Lars Victor Bystedt", "role": "Vittne"},
    {"name": "Svea Helena Caroline Enberg", "role": "Vittne"},
    {"name": "Agnes Marie Hällgren", "role": "Vittne"},
    {"name": "Anna Jessica Maria Höglund", "role": "Vittne"},
    {"name": "Kent Åke Höglund", "role": "Vittne"},
    {"name": "Dan Anton Tobias Johansson", "role": "Vittne"},
    {"name": "Fredrik Max Johansson", "role": "Vittne"},
    {"name": "Ivar Emanuel Johansson", "role": "Målsägande"},
    {"name": "Rut Marit Beatrice Johansson", "role": "Målsägande"},
    {"name": "Lars Anders Markus Karlsson", "role": "Vittne"},
    {"name": "Eija Inkeri Kjäll", "role": "Vittne"},
    {"name": "Neo Arvid Magnus Larsson", "role": "Vittne"},
    {"name": "Lena Marie Susann Lind", "role": "Vittne"},
    {"name": "Elin Linnea Maria Lindell", "role": "Vittne"},
    {"name": "Sofi Teresia Lindwall", "role": "Vittne"},
    {"name": "Lars Thorbjöm Lundgren", "role": "Vittne"},
    {"name": "Fredrik Lars Lundmark", "role": "Vittne"},
    {"name": "Lars-Erik Mikael Molin", "role": "Vittne"},
    {"name": "Per Lars-Erik Molin", "role": "Vittne"},
    {"name": "Robin Alex Nieminen", "role": "Misstänkt"},
    {"name": "Malin Charlotta Nyström", "role": "Vittne"},
    {"name": "Ola Folke Magnus Pålsson", "role": "Vittne"},
    {"name": "Anna Margareta Renlund", "role": "Vittne"},
    {"name": "Karl Emanuel Renström", "role": "Vittne"},
    {"name": "Karl Henrik Sjölund", "role": "Vittne"},
    {"name": "Sven Bertil Stenberg", "role": "Vittne"},
    {"name": "BemdtPatrik Svahn", "role": "Vittne"},
    {"name": "Nea Christina Vänstedt", "role": "Vittne"},
    {"name": "Ola Nils Vänstedt", "role": "Vittne"},
    {"name": "Ulf Peder Öhman", "role": "Vittne"}
]

# Lookup tables derived from ``data``: full name -> role, and the list of
# full names that the fuzzy matching searches.
persons = {entry['name']: entry['role'] for entry in data}
list_of_names = [entry['name'] for entry in data]


def _find_full_name(person, candidates):
    """Return the entry of *candidates* that *person* is a part-wise subset of.

    The two closest candidates according to ``difflib.get_close_matches``
    are tried in the order that function returns them. A candidate is
    accepted only when every whitespace-separated part of *person* also
    occurs as a part of the candidate, so a name that is merely similar
    in spelling is not accepted.

    Args:
        person: The name stored on the interrogation document.
        candidates: The full names to search.

    Returns:
        The first accepted candidate, or ``None`` when neither of the
        close matches contains all parts of *person*.
    """
    # Loop-invariant: split the searched name once, not once per candidate.
    person_parts = set(person.split())
    for candidate in difflib.get_close_matches(person, candidates, n=2):
        if person_parts.issubset(candidate.split()):
            return candidate
    return None


# Resolve the collection handle once and reuse it for reading and updating.
interrogation_collection = arango.db.collection('interrogations')
interrogations = interrogation_collection.all()

for doc in interrogations:
    person = doc['person']
    most_similar_name = _find_full_name(person, list_of_names)

    if most_similar_name:
        # Matched: attach the role and the full roster name, report in green.
        doc['role'] = persons[most_similar_name]
        print("\033[92m" + person + "\033[0m")
        doc['full_name'] = most_similar_name
    else:
        # No match: report in red and set the role to None.
        # NOTE(review): an existing 'full_name' on the document is left
        # untouched in this branch — confirm that is intended.
        doc['role'] = None
        print("\033[91m" + person + "\033[0m")

    # NOTE(review): keep_none=False presumably makes the driver drop
    # None-valued fields (such as an unmatched role) instead of storing
    # them — confirm against the python-arango documentation.
    interrogation_collection.update(doc, keep_none=False)