"""Match unverified person mentions against known persons.

Known persons live in the ArangoDB ``persons`` collection and are indexed in
Chroma (``mala_persons`` by name, ``mala_persons_info`` by description).
"""

import difflib
import multiprocessing
from typing import Union

from _chroma import chroma
from _arango import arango, db
from _llm import LLM
from print_color import *


class Person:
    """Base class holding a person's free-text info and an LLM summary of it."""

    def __init__(self):
        self.info = None
        self.summary = None

    def make_summary(self):
        """Summarize ``self.info`` with the small LLM and store it in ``self.summary``.

        Returns:
            The generated summary, or None when there is no info to summarize.
        """
        if not self.info:
            # Nothing to summarize; avoids len(None) and a pointless LLM call.
            self.summary = None
            return None

        llm = LLM(chat=False, small=True)
        # Longer info is treated as several pieces, shorter as a single piece;
        # only the wording of the prompt differs.
        if len(self.info) > 100:
            summary_prompt = (
                f"Nedan är olika bitar med information om en person:\n {self.info}\n\n"
                "Sammanfatta dessa på ett detaljerat sätt. "
                "Var noga med namn, platser, händelser och relationer. "
                "Använd bara sånt som finns i informationen. "
                "Svara ENBART med sammanfattningen, ingenting annat. "
            )
        else:
            summary_prompt = (
                f"Nedan är information om en person:\n {self.info}\n\n"
                "Sammanfatta denna information så detaljerat som möjligt. "
                "Var noga med namn, platser, händelser och relationer. "
                "Använd bara sånt som finns i informationen. "
                "Svara ENBART med sammanfattningen, ingenting annat. "
            )
        self.summary = llm.generate(summary_prompt)
        return self.summary


class UnverifiedPerson(Person):
    """A person mention that has not yet been matched to a known person.

    Every key of ``doc`` is mirrored as an attribute; ``info`` and ``name`` are
    then normalized (``info`` joined into one string, ``name`` defaulting to "").
    """

    def __init__(self, doc: dict, interrogation: str = None):
        super().__init__()
        self.doc = doc
        for field, value in self.doc.items():
            setattr(self, field, value)

        info_parts = doc.get("info")
        self.info = "\n".join(info_parts) if info_parts else None
        self.name = doc.get("name") or ""


class FoundPerson(Person):
    """
    Represents a person found in ArangoDB.

    Attributes:
        name (str): The name of the person.
        info (str): Additional information about the person.
        key (str): A unique identifier for the person.
        doc (dict): The person's document in ArangoDB.
        summary (str): A summary of the person's details.
    """

    def __init__(self, db, name, key):
        super().__init__()
        self.name = name
        self.key = key
        self.doc = db.collection("persons").get(key)
        self.info = "\n".join(self.doc["info"])
        # make_summary() stores the result on self.summary itself; assigning its
        # return value used to overwrite the summary with None.
        self.make_summary()


def _llm_confirms_name(name_kind, name, text):
    """Ask the small LLM whether ``name`` (a first or last name) occurs in ``text``.

    ``name_kind`` is the Swedish noun used in the prompt ("efternamnet" or
    "förnamnet"). Only the first 5000 characters of ``text`` are sent.
    """
    llm = LLM(small=True)
    answer = llm.generate(
        f'Nämns någon med {name_kind} "{name}" i texten nedan?\n\n'
        f'"""{text[:5000]}"""\n\n'
        "Namnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. "
        'Svara "JA" eller "NEJ"'
    )
    return "JA" in answer


def check_name(person, answer_person, text):
    """Decide whether ``person`` plausibly refers to ``answer_person``.

    Args:
        person (str): The name as mentioned (may be a single first or last name).
        answer_person (str): The candidate's full name.
        text (str): The text in which ``person`` is mentioned.

    Returns:
        bool: True if the names are judged to refer to the same person.
    """
    print_yellow(person, " - ", answer_person)
    person_parts = person.strip().split()
    print("Length person:", len(person_parts))

    # Multi-word (or empty) names are compared as whole strings.
    if len(person_parts) != 1:
        name_similarity = difflib.SequenceMatcher(None, person, answer_person).ratio()
        print("Similarity:", name_similarity)
        return name_similarity > 0.85

    # person is a single name: compare it to the candidate's first and last name.
    answer_parts = answer_person.split()
    if not answer_parts:
        return False
    answer_first_name = answer_parts[0]
    answer_last_name = answer_parts[-1]

    first_name_similarity = difflib.SequenceMatcher(
        None, person, answer_first_name
    ).ratio()
    last_name_similarity = difflib.SequenceMatcher(
        None, person, answer_last_name
    ).ratio()
    print("First name similarity:", first_name_similarity)
    print("Last name similarity:", last_name_similarity)

    if first_name_similarity > 0.9:
        if answer_last_name in text:
            return True
        # Count how many known persons share this first name; a unique first
        # name is accepted without further evidence.
        first_name_count = sum(
            1
            for doc in db.collection("persons").all()
            if (doc.get("name") or "").split()[:1] == [answer_first_name]
        )
        print("First name count:", first_name_count)
        if first_name_count == 1:
            return True
        return _llm_confirms_name("efternamnet", answer_last_name, text)

    if last_name_similarity > 0.9:
        if answer_first_name in text:
            return True
        return _llm_confirms_name("förnamnet", answer_first_name, text)

    return False


def find_with_llm(unverified_person: UnverifiedPerson):
    """Build the LLM summary for ``unverified_person`` (no lookup is done yet)."""
    unverified_person.make_summary()


def _has_hit(query_results):
    """Return True if a Chroma query result contains at least one match."""
    distances = query_results["distances"]
    return bool(distances) and bool(distances[0])


def find_person(
    unverified_person: Union[dict, UnverifiedPerson, str] = None,
    name: str = None,
    key: str = None,
):
    """
    Finds the known person that best matches an unverified person.

    The name index is queried first; if the best hit is too distant
    (distance > 1) the person's summary is matched against the info index.

    Args:
        unverified_person (Union[dict, UnverifiedPerson, str]): The unverified
            person to identify. A plain string is treated as the name.
        name (str): Name to set on the document when a dict/None is passed.
        key (str): ``_key`` to set on the document when a dict/None is passed.

    Returns:
        FoundPerson: The found person, or None when no close enough match exists.
    """
    if not isinstance(unverified_person, UnverifiedPerson):
        if unverified_person is None:
            person_doc = {}
        elif isinstance(unverified_person, str):
            person_doc = {"name": unverified_person}
        else:
            person_doc = unverified_person
        if name:
            person_doc["name"] = name
        if key:
            person_doc["_key"] = key
        unverified_person = UnverifiedPerson(person_doc)

    # Never match the person against itself or against names already rejected.
    # list.append() returns None, so the list must be built in separate steps.
    excluded_names = [unverified_person.name]
    for rejected in unverified_person.doc.get("is_not") or []:
        if isinstance(rejected, (list, tuple)):
            # Older documents may hold nested lists; flatten them.
            excluded_names.extend(rejected)
        else:
            excluded_names.append(rejected)
    filter_isnot = {"name": {"$nin": excluded_names}}

    query_results = chroma.query(
        query_texts=[unverified_person.name],
        n_results=1,
        where=filter_isnot,
        collection="mala_persons",
    )
    if _has_hit(query_results):
        distance = query_results["distances"][0][0]
        print_purple(query_results["metadatas"][0][0]["name"], distance)
    else:
        distance = None

    if distance is None or distance > 1:
        # The name alone was not close enough: fall back to the description.
        unverified_person.make_summary()
        if not unverified_person.summary:
            return None
        query_results = chroma.query(
            query_texts=[unverified_person.summary],
            n_results=1,
            where=filter_isnot,
            collection="mala_persons_info",
        )
        if not _has_hit(query_results):
            return None
        distance = query_results["distances"][0][0]
        print_yellow(query_results["metadatas"][0][0]["name"], distance)
        if distance > 1:
            return None

    print_red("NAME", query_results["documents"][0][0])
    best_metadata = query_results["metadatas"][0][0]
    return FoundPerson(
        db,
        name=best_metadata["name"],
        key=best_metadata["_key"],
    )


def identify(unverified_person: Union[dict, UnverifiedPerson]):
    """
    Finds a candidate match for a person and asks the LLM to judge it.

    Args:
        unverified_person (Union[dict, UnverifiedPerson]): The person document
            (or wrapper) to identify.

    Returns:
        dict: A dictionary containing the following keys:
            - "unverified_person": The UnverifiedPerson instance.
            - "found_person": The matching FoundPerson, or None.
            - "suggestions": A list of tuples. With a match, each tuple is
              (LLM answer, interrogation document); without a match, each
              tuple is (None, interrogation id).
    """
    llm = LLM(small=True)
    if not isinstance(unverified_person, UnverifiedPerson):
        unverified_person = UnverifiedPerson(unverified_person)

    interrogation_ids = unverified_person.doc.get("mentioned_in_interrogation") or []
    found_person = find_person(unverified_person)

    if not found_person:
        return {
            "unverified_person": unverified_person,
            "found_person": None,
            "suggestions": [(None, i) for i in interrogation_ids],
        }

    suggestions = []
    for interrogation_id in interrogation_ids:
        interrogation_data = db.collection("interrogations").get(interrogation_id)
        text = interrogation_data["text"]
        answer_prompt = (
            f'I texten nedan omnämns en "{unverified_person.name}" och jag försöker förstå '
            "om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n "
            f'TEXT: """{text}"""\n '
            "På andra ställen i polisens förundersökning finns en person som heter "
            f'"{found_person.name}", och som beskrivs så här: """{found_person.summary}"""\n '
            f"Verkar det troligt att personen som kallas {unverified_person.name} "
            f"är samma person som {found_person.name}? "
            "Svara bara JA eller NEJ, samt en kort förklaring till varför. "
        )
        answer = llm.generate(answer_prompt)
        suggestions.append((answer, interrogation_data))

    return {
        "unverified_person": unverified_person,
        "found_person": found_person,
        "suggestions": suggestions,
    }


def verify(
    db,
    answer=None,
    unverified_person=None,
    found_person=None,
    interrogation_key=None,
):
    """
    Records the outcome of a manual identification in the database.

    Args:
        db: The database object.
        answer (str): The verdict. Can be "Yes", "No", or "Unknown".
        unverified_person (UnverifiedPerson): The person being identified.
        found_person (FoundPerson): The suggested match in ArangoDB.
        interrogation_key (str): The key identifying the interrogation.

    Returns:
        None
    """
    print_blue("Answer:", answer)

    if answer == "Yes":
        # This mention is now resolved for the unverified person.
        mentions = unverified_person.doc.setdefault("mentioned_in_interrogation", [])
        if interrogation_key in mentions:
            mentions.remove(interrogation_key)
        db.collection("persons").update(unverified_person.doc)

        # Merge the unverified person's data into the confirmed person.
        found_person.doc["confirmed"] = True
        found_info = found_person.doc.setdefault("info", [])
        for piece in unverified_person.doc.get("info") or []:
            if piece not in found_info:
                found_info.append(piece)
        found_mentions = found_person.doc.setdefault("mentioned_in_interrogation", [])
        if interrogation_key is not None and interrogation_key not in found_mentions:
            found_mentions.append(interrogation_key)

        print("Updated person in arango:")
        print_green(
            db.collection("persons").insert(found_person.doc, overwrite_mode="update")
        )

        # Once no mentions remain, archive the unverified document (unless it
        # is the very same document as the confirmed person).
        if not mentions and unverified_person.doc.get("_key") != found_person.doc.get(
            "_key"
        ):
            db.collection("other_persons").insert(
                unverified_person.doc, overwrite_mode="update"
            )
            db.collection("persons").delete(unverified_person.doc, check_rev=False)
            print_red(f"Removed {unverified_person.doc}")

    elif answer == "No":
        # Remember the rejected name so find_person() can exclude it next time.
        rejected_names = unverified_person.doc.setdefault("is_not", [])
        rejected_names.append(found_person.doc["name"])
        db.collection("persons").update(
            unverified_person.doc, merge=True, check_rev=False
        )

    elif answer == "Unknown":
        db.collection("unknown").insert(
            {"name": unverified_person.name, "interrogation": interrogation_key},
            overwrite=True,
        )


if __name__ == "__main__":
    q = "for doc in persons filter doc.other == true return doc"
    other_persons = list(db.aql.execute(q))
    for person in other_persons:
        print(find_person(person))
    # NOTE(review): the original indentation of exit() was lost; it is placed
    # after the loop here. Move it into the loop to stop after one person.
    exit()
    # with multiprocessing.Pool() as pool:
    #     pool.map(find_person, other_persons)