You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

110 lines
4.5 KiB

import difflib
from _arango import arango
# filename = "Huvudprotokoll.pdf"
# doc = fitz.open(f"pdfs/{filename}")
# def group_words_by_y(words, tolerance=2):
# # Sort the words by their y-coordinate
# words.sort(key=lambda word: word[1])
# # Group the words by their rounded y-coordinate
# grouped_words = itertools.groupby(words, key=lambda word: round(word[1] / tolerance))
# # Sort the words in each group by their x-coordinate and combine the text
# combined_words = [' '.join(word[4] for word in sorted(group, key=lambda word: word[0])) for _, group in grouped_words]
# return combined_words
# append = False
# for page in doc.pages(1,3):
# words = []
# text_words = page.get_text('words', sort=True)
# for word in text_words:
# if append:
# words.append(word)
# if word[4] == "Brottsplatsadress":
# append = True
# combined_words = group_words_by_y(words, tolerance=5)
# for word in combined_words:
# last_space_index = word.rfind(' ')
# if last_space_index != -1:
# first_part = word[:last_space_index]
# if ',' in first_part:
# word_parts = first_part.split(',')
# first_part = word_parts[1].strip() + ' ' + word_parts[0].strip()
# second_part = word[last_space_index+1:]
# else:
# first_part = word
# second_part = ''
# print(first_part.strip(), ';', second_part.strip())
# Take the output and clean it up in Excel
# Persons listed in the main protocol, one {"name", "role"} dict per person,
# in the order they were extracted. Roles are kept in Swedish exactly as
# they appear in the source ("Misstänkt", "Vittne", "Målsägande").
#
# NOTE(review): a few names look like OCR artefacts (e.g. "lngemund",
# "Thorbjöm", "BemdtPatrik"). They are kept verbatim on purpose: code that
# consumes this list compares name tokens exactly, so confirm against the
# stored data before correcting any spelling.
data = [
    {"name": full_name, "role": role}
    for full_name, role in (
        ("Carl-William Ahlqvist", "Misstänkt"),
        ("Elias David Ahlqvist", "Vittne"),
        ("Marlene Linnea Ahlqvist", "Misstänkt"),
        ("Jhonny Kaj lngemund Backman", "Vittne"),
        ("Louise Solveig Karin Bengtsson", "Vittne"),
        ("Ove Robert Greger Bengtsson", "Misstänkt"),
        ("Björn Willy Johnny Borell", "Vittne"),
        ("Lars Victor Bystedt", "Vittne"),
        ("Svea Helena Caroline Enberg", "Vittne"),
        ("Agnes Marie Hällgren", "Vittne"),
        ("Anna Jessica Maria Höglund", "Vittne"),
        ("Kent Åke Höglund", "Vittne"),
        ("Dan Anton Tobias Johansson", "Vittne"),
        ("Fredrik Max Johansson", "Vittne"),
        ("Ivar Emanuel Johansson", "Målsägande"),
        ("Rut Marit Beatrice Johansson", "Målsägande"),
        ("Lars Anders Markus Karlsson", "Vittne"),
        ("Eija Inkeri Kjäll", "Vittne"),
        ("Neo Arvid Magnus Larsson", "Vittne"),
        ("Lena Marie Susann Lind", "Vittne"),
        ("Elin Linnea Maria Lindell", "Vittne"),
        ("Sofi Teresia Lindwall", "Vittne"),
        ("Lars Thorbjöm Lundgren", "Vittne"),
        ("Fredrik Lars Lundmark", "Vittne"),
        ("Lars-Erik Mikael Molin", "Vittne"),
        ("Per Lars-Erik Molin", "Vittne"),
        ("Robin Alex Nieminen", "Misstänkt"),
        ("Malin Charlotta Nyström", "Vittne"),
        ("Ola Folke Magnus Pålsson", "Vittne"),
        ("Anna Margareta Renlund", "Vittne"),
        ("Karl Emanuel Renström", "Vittne"),
        ("Karl Henrik Sjölund", "Vittne"),
        ("Sven Bertil Stenberg", "Vittne"),
        ("BemdtPatrik Svahn", "Vittne"),
        ("Nea Christina Vänstedt", "Vittne"),
        ("Ola Nils Vänstedt", "Vittne"),
        ("Ulf Peder Öhman", "Vittne"),
    )
]
# Look-up structures derived from `data`: full name -> role, plus the plain
# list of full names that difflib searches.
persons = {entry['name']: entry['role'] for entry in data}
list_of_names = [entry['name'] for entry in data]


def _find_full_name(person, candidates):
    """Return the candidate full name that *person* abbreviates, or None.

    difflib proposes at most two close matches (with its default cutoff).
    A proposal is accepted only if every whitespace-separated token of
    *person* also occurs as a token of the proposal, so "Lars Molin" can
    match "Lars-Erik Mikael Molin" only if "Lars" itself is one of the
    candidate's tokens.
    """
    # The token set does not depend on the candidate, so build it once.
    person_tokens = set(person.split())
    for candidate in difflib.get_close_matches(person, candidates, n=2):
        if person_tokens.issubset(candidate.split()):
            return candidate
    return None


# Fetch the collection handle once and reuse it for both reading and writing.
_interrogations_collection = arango.db.collection('interrogations')
interrogations = _interrogations_collection.all()

for doc in interrogations:
    person = doc.get('person')
    if not isinstance(person, str):
        # Nothing to match on: treat the document as unmatched instead of
        # aborting the run half-way through the collection.
        person = ''

    most_similar_name = _find_full_name(person, list_of_names)

    if most_similar_name:
        doc['role'] = persons[most_similar_name]
        colour = "\033[92m"  # green: matched
    else:
        doc['role'] = None
        colour = "\033[91m"  # red: no match
    print(colour + person + "\033[0m")

    doc['full_name'] = most_similar_name
    # keep_none=False is passed so that the None values set above are not
    # stored as nulls (see python-arango's `update` documentation).
    _interrogations_collection.update(doc, keep_none=False)