import difflib
import multiprocessing
import random
import re
import traceback
from pprint import pprint
from time import sleep

from langchain_text_splitters import CharacterTextSplitter

from _llm import LLM
from _arango import arango
from print_color import *

# Shared database handle used by every function below. It is bound at import
# time (not only under ``__main__``) so that importing this module, or running
# it in worker processes that re-import the main module, does not hit a
# NameError on ``db``.
db = arango.db

# Minimum difflib ratio for two names to be treated as the same name.
_NAME_SIMILARITY_THRESHOLD = 0.9


def _is_similar(a, b):
    """Return True when *a* and *b* are near-identical according to difflib."""
    return difflib.SequenceMatcher(None, a, b).ratio() > _NAME_SIMILARITY_THRESHOLD


def _llm_says_name_is_mentioned(name_kind, name, text):
    """Ask a fresh LLM whether *name* occurs in *text*; True if it answers "JA".

    *name_kind* is the Swedish noun phrase placed in the question
    ("efternamnet" or "förnamnet"). Only the first 6000 characters of *text*
    are sent.
    """
    prompt = (
        f'Nämns någon med {name_kind} "{name}" i texten nedan?\n\n'
        f'"""{text[:6000]}"""\n\n'
        "Namnet behöver inte vara stavat på exakt samma sätt, "
        "men det ska vara samma namn. "
        'Svara "JA" eller "NEJ"'
    )
    answer = LLM().generate(prompt)
    return "JA" in answer


def check_name(person, answer_person, text):
    """Decide whether *person* (as written in the text) is *answer_person*.

    Args:
        person: The name as it was mentioned in the interrogation.
        answer_person: The candidate full name picked from the persons list.
        text: The interrogation text, used to look for the missing name part.

    Returns:
        True if the two names are judged to refer to the same person.

    When *person* is a single word it is compared against the first and the
    last word of *answer_person*; the other half of the candidate name must
    then be supported by the text, by the first name being unique in the
    ``persons`` collection, or by an LLM yes/no answer. Multi-word names are
    compared as whole strings.
    """
    print_yellow(person, " - ", answer_person)

    if len(person.strip().split()) == 1:
        answer_parts = answer_person.split()
        if not answer_parts:
            # A blank candidate has no first/last name to compare against.
            return False
        answer_first_name = answer_parts[0]
        answer_last_name = answer_parts[-1]

        if _is_similar(person, answer_first_name):
            if answer_last_name in text:
                return True
            # Count how many times the first name appears among all persons;
            # a unique first name is accepted without further evidence.
            first_names = [
                i["name"].split()[0]
                for i in db.collection("persons").all()
                if i["name"].split()
            ]
            if first_names.count(answer_first_name) == 1:
                return True
            return _llm_says_name_is_mentioned("efternamnet", answer_last_name, text)

        if _is_similar(person, answer_last_name):
            if answer_first_name in text:
                return True
            return _llm_says_name_is_mentioned("förnamnet", answer_first_name, text)

        return False

    name_similarity = difflib.SequenceMatcher(None, person, answer_person).ratio()
    print("Similarity:", name_similarity)
    # person_first_name = person.split()[0]
    # answer_person_first_name = answer_person.split()[0]
    # first_name_similarity = difflib.SequenceMatcher(
    #     None, person_first_name, answer_person_first_name
    # ).ratio()
    # person_last_name = person.split()[-1]
    # answer_person_last_name = answer_person.split()[-1]
    # print("new:", name_similarity)
    return name_similarity > _NAME_SIMILARITY_THRESHOLD


def execute_query_with_retry(db, query, max_retries=5, delay=2):
    """Run an AQL *query* on *db*, retrying on failure.

    Args:
        db: Database handle exposing ``aql.execute``.
        query: The AQL query string.
        max_retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.

    Returns:
        The query result materialised as a list.

    Raises:
        Exception: The error from the last failed attempt once all attempts
            are used up (a string result is reported as ``ValueError``).
        ValueError: If *max_retries* is less than 1, so no attempt was made.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            result = db.aql.execute(query)
            # If the result is a string, treat it as a failure.
            if isinstance(result, str):
                raise ValueError(f"Unexpected result from database: {result}")
            return list(result)
        except Exception as e:
            last_error = e
            print(f"Error executing query, attempt {attempt+1}: {e}")
            # No point in waiting after the final attempt.
            if attempt < max_retries - 1:
                sleep(delay)

    # A bare ``raise`` here would fail with "No active exception to re-raise"
    # because the except block has already ended; re-raise the saved error.
    if last_error is None:
        raise ValueError("max_retries must be at least 1")
    raise last_error


def _find_mentioned_names(llm, interrogation, chunks):
    """Ask the LLM for person names per chunk.

    Returns a dict mapping each distinct name to the first chunk it was
    reported in, in order of discovery.
    """
    names = {}
    for chunk in chunks:
        # Find persons in the text
        prompt = (
            "Det här är en text från ett polisförhör där "
            f'{interrogation["person"]} förhörs:\n'
            f' """{chunk}"""\n'
            " Vilka personer nämns i texten som inte förekommit tidigare?"
            " Svara ENBART med en pythonformaterad lista av namn."
            " Exempel på svar för att du ska förstå formen:"
            ' "["namn1", "namn2", "namn3"]".'
            " Jag är inte intresserad av förhörsledaren eller personen som förhörs."
        )
        response = llm.generate(prompt)
        response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")
        for raw_name in response.split(","):
            name = raw_name.strip()
            # Keep the original length filter, but also drop tokens that are
            # only whitespace: they would crash later on ``name.split()[0]``.
            if len(raw_name) > 2 and name and name not in names:
                names[name] = chunk
    return names


def _lookup_person(name, persons_dict, known_persons):
    """Return the person doc for *name* via exact, alias or reversed match, else None."""
    if name in persons_dict:
        return persons_dict[name]
    if name in known_persons:
        # ``.get`` so an alias whose target is missing from the database falls
        # through to fuzzy matching instead of raising KeyError.
        return persons_dict.get(known_persons[name])
    # ``list.reverse()`` works in place and returns None, so build the
    # reversed name ("Last First" -> "First Last") explicitly.
    reversed_name = " ".join(reversed(name.split()))
    if reversed_name in persons_dict:
        print("Vände och hittade ✌️", reversed_name)
        return persons_dict[reversed_name]
    return None


def _record_mention(person, name, info, interrogation):
    """Add this mention to a confirmed person doc and update it in the database."""
    print_green(f'{name} identified: {person["name"]}', "\n")
    infos = person.setdefault("info", [])
    if info not in infos:
        infos.append(info)
    mentioned_in = person.setdefault("mentioned_in_interrogation", [])
    if interrogation["_key"] not in mentioned_in:
        mentioned_in.append(interrogation["_key"])
    mentioned_as = person.setdefault("mentioned_as", [])
    mention = {name: interrogation["_key"]}
    if mention not in mentioned_as:
        mentioned_as.append(mention)
    db.collection("persons").update(person, check_rev=False)
    # db.collection("all_relations").insert(
    #     {
    #         "_from": interrogation["person_id"],
    #         "_to": person["_id"],
    #         "relation": "mentioned_by",
    #         "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
    #         "_key": f'{interrogation["_key"]}_{person["_key"]}'
    #     },
    #     overwrite_mode="update",
    #     merge=True,
    # )


def _store_unconfirmed(name, info, interrogation):
    """Create, or extend, an unconfirmed ("other") person doc for *name*."""
    _key = arango.fix_key_name(name)
    # TODO Are there multiple persons with the same name?
    # If no confirmed person was identified, create a new person or add to
    # another unconfirmed person.
    collection = db.collection("persons")
    doc = collection.get(_key)
    if not doc:
        doc = {
            "_key": _key,
            "name": name,
            "info": [info],
            "other": True,
            "confirmed": False,
            "mentioned_in_interrogation": [interrogation["_key"]],
        }
    else:
        # Stored docs may lack these lists; create them instead of raising.
        mentioned_in = doc.setdefault("mentioned_in_interrogation", [])
        if interrogation["_key"] not in mentioned_in:
            mentioned_in.append(interrogation["_key"])
        infos = doc.setdefault("info", [])
        if info not in infos:
            infos.append(info)
    collection.insert(doc, merge=False, overwrite_mode="update")
    # db.collection("all_relations").insert(
    #     {
    #         "_from": interrogation["person_id"],
    #         "_to": doc["_id"],
    #         "relation": "mentioned_by",
    #         'other': True,
    #         "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
    #         "_key": f'{interrogation["_key"]}_{doc["_key"]}'
    #     },
    #     overwrite_mode="update",
    #     merge=True,
    # )


def extract_persons(interrogation):
    """Find persons mentioned in one interrogation and store them in ``persons``.

    Args:
        interrogation: Interrogation document; the keys ``_key``, ``text`` and
            ``person`` are read.

    The text is split into chunks, an LLM lists the names in each chunk, and
    each name is then matched against the known persons (exact match, alias,
    reversed word order, and finally an LLM choice among fuzzy candidates that
    is verified with ``check_name``). Matched persons are updated in place;
    unmatched names are stored as unconfirmed persons.
    """
    known_persons = {
        "Douglas": "Douglas Bengtsson",
        "Rashid": "Rashid Sheiksaid",
        "Emanuel": "Emanuel Johansson",
        "Robert": "Robert Bengtsson",
    }
    # Small random pause before the first database query.
    sleep(random.uniform(0.05, 0.3))
    print("INTERROGATION:", interrogation["_key"])

    q = "for doc in persons filter doc.other != true return doc"
    persons_docs = execute_query_with_retry(db, q)
    # All three lookups use the stripped name so that membership tests and
    # dict lookups can never disagree about stray whitespace.
    persons = [i["name"].strip() for i in persons_docs]
    first_names = {
        i["name"].split()[0]: i["name"].strip()
        for i in persons_docs
        if i["name"].split()
    }
    persons_dict = {i["name"].strip(): i for i in persons_docs}

    text_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=4000,
        chunk_overlap=0,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_text(interrogation["text"])

    llm = LLM(
        chat=True,
        system_prompt=(
            "Du är en assistent som hjälper till att hitta personer i ett"
            " polisförhör. Du får en del av texten från förhöret åt gången."
            " Svara bara när personen finns i den del du får,"
            " hitta inte på personer."
        ),
    )

    # NOTE(review): the original layout was lost, so it is unclear whether the
    # per-name loop ran inside or after the chunk loop. Either way it read a
    # ``chunk`` that was not tied to the name; each name is now paired with
    # the chunk it was reported in. Confirm this matches the intent.
    names = _find_mentioned_names(llm, interrogation, chunks)

    for name, chunk in names.items():
        # Collect what the text says about the person.
        prompt = (
            f"Jag vill veta vem {name} är. Kolla på förhöret nedan och svara"
            " om du hittar något om personen där."
            f' """{chunk}"""\n'
            f" Vem är {name}? Svara bara med sådant som finns i texten."
        )
        info = llm.generate(prompt)

        # Compare the person to the list of known persons.
        person = _lookup_person(name, persons_dict, known_persons)
        persons_string = ""
        if person is None:
            closest_matches = difflib.get_close_matches(name, persons, n=4, cutoff=0.3)
            first_name = name.split()[0]
            if first_name in first_names:
                if first_names[first_name] not in closest_matches:
                    closest_matches.append(first_names[first_name])
            persons_string = "\n".join(closest_matches)
            prompt = (
                f"Jag behöver identifiera {name}. Nedan är en lista på personer"
                " det kanske skulle kunna vara:\n"
                f" {persons_string}\n"
                f" Är {name} någon av dessa personer? I texten kan personen stå med"
                " bara sitt förnamn eller efternamn, så kolla speciellt efter namn i"
                " listan där förnamnet eller efternamnet stämmer. Namnet kan också"
                " vara felstavat, men inte ett helt annat namn. Svara BARA med namnet"
                ' på personen ur listan. Är du inte säker så svara "None".'
            )
            # Strip surrounding whitespace/quotes so an otherwise exact answer
            # is not rejected by the membership test below.
            answer_person = llm.generate(prompt).strip().strip("\"'")
            if answer_person in persons_dict and check_name(
                name, answer_person, interrogation["text"]
            ):
                person = persons_dict[answer_person]

        if person:
            _record_mention(person, name, info, interrogation)
        else:
            print(f"\033[91m{name} not identified\033[0m")
            print_yellow(
                "\n".join([f"- {i}" for i in persons_string.split("\n")]), "\n"
            )
            print()
            _store_unconfirmed(name, info, interrogation)


def main(parallel=False):
    """Run person extraction over the selected interrogations.

    Args:
        parallel: When True, process the interrogations in a pool of three
            worker processes; otherwise (the default, matching the previous
            behaviour of the script) process them one by one.
    """
    q = 'for doc in interrogations filter doc._key == "Markus_Lindahl_2023-02-20_p.93" return doc'
    interrogations = list(db.aql.execute(q))
    interrogations.sort(key=lambda x: x["date"])

    # Keys of interrogations already referenced by some person. Currently only
    # consumed by the disabled resume filter below.
    persons = list(db.collection("persons").all())
    interrogations_done = []
    for person in persons:
        if person.get("mentioned_in_interrogation"):
            interrogations_done.extend(person["mentioned_in_interrogation"])

    # interrogations = [
    #     interrogation
    #     for interrogation in interrogations
    #     if interrogation["_key"] not in set(interrogations_done)
    # ]
    # print("Number of interrogations to process:", len(interrogations))

    # q = 'for doc in interrogations return doc'
    # interrogations = list(db.aql.execute(q))
    # # Filter out interrogations that have their _key in the rumors collection
    # q = 'for rumor in rumors return rumor._key'
    # rumors = list(db.aql.execute(q))
    # interrogations = [interrogation for interrogation in interrogations if interrogation['_key'] not in rumors]
    # print('Number of interrogations to process:', len(interrogations))

    if parallel:
        with multiprocessing.Pool(processes=3) as pool:
            pool.map(extract_persons, interrogations)
    else:
        for interrogation in interrogations:
            extract_persons(interrogation)


if __name__ == "__main__":
    main()