You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
327 lines
11 KiB
327 lines
11 KiB
from _chroma import chroma |
|
from _arango import arango, db |
|
from _llm import LLM |
|
from print_color import * |
|
import multiprocessing |
|
from typing import Union |
|
import difflib |
|
|
|
class Person:
    """Base class for persons whose information can be summarised by the LLM.

    Attributes:
        info: Text with the known information about the person, or None.
        summary: LLM-generated summary of ``info``; None until
            ``make_summary`` has produced one.
    """

    # Info longer than this many characters gets the "several pieces of
    # information" prompt instead of the single-piece prompt.
    _MULTI_PART_INFO_LENGTH = 100

    def __init__(self):
        self.info = None
        self.summary = None

    def make_summary(self):
        """Summarise ``self.info`` with the small LLM and store the result.

        Returns:
            The generated summary, which is also stored in ``self.summary``.
            None when there is no information to summarise; in that case the
            LLM is not called.
        """
        # Guard first: subclasses may leave info as None, and asking the LLM
        # to summarise nothing only invites made-up content.
        if not self.info:
            self.summary = None
            return None

        llm = LLM(chat=False, small=True)

        if len(self.info) > self._MULTI_PART_INFO_LENGTH:
            intro = "Nedan är olika bitar med information om en person:"
            instruction = "Sammanfatta dessa på ett detaljerat sätt."
        else:
            intro = "Nedan är information om en person:"
            instruction = "Sammanfatta denna information så detaljerat som möjligt."

        summary_prompt = (
            f"{intro}\n\n"
            f"{self.info}\n\n{instruction} Var noga med namn, platser, händelser och relationer.\n"
            "Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. "
        )

        self.summary = llm.generate(summary_prompt)
        # Returned as well as stored, so `x.summary = x.make_summary()` works.
        return self.summary
|
|
|
|
|
class UnverifiedPerson(Person):
    """A person built from a raw document whose identity is not yet confirmed.

    Every key in ``doc`` is exposed as an instance attribute. ``info`` and
    ``name`` are then set explicitly, so they always exist on the instance.
    """

    def __init__(self, doc: dict, interrogation: str = None):
        """Create the person from ``doc``.

        Args:
            doc: The person document. ``doc["info"]``, when present, is joined
                with newlines into ``self.info``.
            interrogation: Accepted but not used by this constructor.
        """
        super().__init__()
        self.doc = doc

        # Mirror each document field onto the instance.
        for field, value in doc.items():
            setattr(self, field, value)

        # Assigned after the mirroring loop so these normalised values win.
        self.info = "\n".join(doc["info"]) if "info" in doc else None
        self.name = doc.get("name", "")
|
|
|
|
|
class FoundPerson(Person):
    """
    Represents a person found in ArangoDB.

    Attributes:
        name (str): The name of the person.
        info (str): The entries of the document's "info" field, joined with newlines.
        key (str): The key of the person's document in the "persons" collection.
        doc (dict): The person's document in ArangoDB.
        summary (str): A summary of the person's details, produced by make_summary().
    """

    def __init__(self, db, name, key):
        """Load the person document and summarise it.

        Args:
            db: Database handle; must provide ``collection("persons").get(key)``.
            name: The name of the person.
            key: The key of the person's document in the "persons" collection.

        Raises:
            LookupError: If no document with ``key`` exists in "persons".
        """
        super().__init__()
        self.name = name
        self.key = key
        self.doc = db.collection("persons").get(key)
        if self.doc is None:
            # Fail with a clear message instead of a TypeError on None["info"].
            raise LookupError(f'No document with key "{key}" in collection "persons"')

        # A document without an "info" field is treated as having no info.
        self.info = "\n".join(self.doc.get("info") or [])

        # make_summary() stores its result in self.summary itself; assigning
        # its return value here used to overwrite the summary with None.
        self.make_summary()
|
|
|
|
|
def check_name(person, answer_person, text):
    """Decide whether the name ``person`` refers to ``answer_person``.

    When ``person`` is a single word it is compared with the first and the
    last word of ``answer_person``:

    * If it matches the first name (similarity > 0.9), the names are treated
      as the same when the last name occurs in ``text``, when exactly one
      document in the "persons" collection has that first name, or when the
      LLM says the last name is mentioned in ``text``.
    * If it matches the last name (similarity > 0.9), the names are treated
      as the same when the first name occurs in ``text`` or when the LLM says
      it is mentioned there.

    Otherwise the full strings are compared and must have a similarity above
    0.85.

    Args:
        person (str): The name to check.
        answer_person (str): The candidate full name.
        text (str): The text in which ``person`` was mentioned.

    Returns:
        bool: True if the two names are judged to be the same person.
    """
    print_yellow(person, " - ", answer_person)

    person_parts = person.strip().split()
    print('Length person:', len(person_parts))

    # Anything that is not a single word is compared as a whole string.
    if len(person_parts) != 1:
        name_similarity = difflib.SequenceMatcher(None, person, answer_person).ratio()
        print("Similarity:", name_similarity)
        return name_similarity > 0.85

    answer_parts = answer_person.split()
    if not answer_parts:
        # Nothing to compare the single name against.
        return False

    # Use the stripped token so surrounding whitespace cannot lower the ratio.
    single_name = person_parts[0]
    answer_first_name = answer_parts[0]
    answer_last_name = answer_parts[-1]

    first_name_similarity = difflib.SequenceMatcher(
        None, single_name, answer_first_name
    ).ratio()
    last_name_similarity = difflib.SequenceMatcher(
        None, single_name, answer_last_name
    ).ratio()
    print("First name similarity:", first_name_similarity)
    print("Last name similarity:", last_name_similarity)

    if first_name_similarity > 0.9:
        if answer_last_name in text:
            return True
        # A first name that only one known person has is accepted as a match.
        first_name_count = _count_first_name(answer_first_name)
        print("First name count:", first_name_count)
        if first_name_count == 1:
            return True
        return _llm_confirms_name("efternamnet", answer_last_name, text)

    if last_name_similarity > 0.9:
        if answer_first_name in text:
            return True
        return _llm_confirms_name("förnamnet", answer_first_name, text)

    return False


def _count_first_name(first_name):
    """Count the documents in "persons" whose name starts with ``first_name``."""
    count = 0
    for doc in db.collection("persons").all():
        # Skip documents with a missing or empty name instead of crashing.
        name_parts = (doc.get("name") or "").split()
        if name_parts and name_parts[0] == first_name:
            count += 1
    return count


def _llm_confirms_name(name_kind, name, text):
    """Ask the small LLM whether someone with ``name`` is mentioned in ``text``.

    ``name_kind`` is the Swedish noun phrase inserted into the prompt
    ("efternamnet" or "förnamnet"). Only the first 5000 characters of
    ``text`` are sent.
    """
    answer = LLM(small=True).generate(
        f'Nämns någon med {name_kind} "{name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
    )
    return "JA" in answer
|
|
|
|
|
def find_with_llm(unverified_person: UnverifiedPerson):
    """Generate the summary for ``unverified_person``.

    This only triggers ``make_summary()`` on the given person; nothing is
    returned.
    """
    unverified_person.make_summary()
    return None
|
|
|
|
|
|
|
def find_person(
    unverified_person: Union[dict, UnverifiedPerson, str] = None,
    name: str = None,
    key: str = None,
    max_distance: float = 1,
):
    """
    Finds the closest matching person in the Chroma db.

    The person is first looked up by name in the "mala_persons" collection.
    If the best hit is further away than ``max_distance``, a summary of the
    person's info is generated and looked up in "mala_persons_info" instead.

    Args:
        unverified_person (Union[dict, UnverifiedPerson, str]): The unverified
            person to identify. A dict is wrapped in an UnverifiedPerson; a
            plain string is treated as the person's name.
        name (str): Name to use when ``unverified_person`` is not already an
            UnverifiedPerson.
        key (str): Document key to use when ``unverified_person`` is not
            already an UnverifiedPerson.
        max_distance (float): Largest query distance accepted as a match.

    Returns:
        FoundPerson: The found person, or None when no hit is close enough.
    """
    if not isinstance(unverified_person, UnverifiedPerson):
        if unverified_person is None:
            person_doc = {}
        elif isinstance(unverified_person, str):
            # NOTE(review): a bare string is assumed to be a name, since the
            # lookup below is name based — confirm against callers.
            person_doc = {"name": unverified_person}
        else:
            # Work on a copy so the caller's dict is not modified.
            person_doc = dict(unverified_person)
        if name:
            person_doc["name"] = name
        if key:
            person_doc["_key"] = key
        unverified_person = UnverifiedPerson(person_doc)

    # Never match the person against its own name or against names already
    # rejected for it. Entries of "is_not" may be plain names or lists of names.
    excluded_names = [unverified_person.name]
    for entry in unverified_person.doc.get("is_not") or []:
        if isinstance(entry, (list, tuple)):
            excluded_names.extend(entry)
        else:
            excluded_names.append(entry)
    filter_isnot = {"name": {"$nin": excluded_names}}

    hit = _top_hit(
        chroma.query(
            query_texts=[unverified_person.name],
            n_results=1,
            where=filter_isnot,
            collection="mala_persons",
        )
    )
    if hit is not None:
        print_purple(hit[1]["name"], hit[0])

    if hit is None or hit[0] > max_distance:
        # The name lookup was inconclusive: fall back to comparing summaries.
        if not unverified_person.info:
            return None
        unverified_person.make_summary()
        if not unverified_person.summary:
            return None
        hit = _top_hit(
            chroma.query(
                query_texts=[unverified_person.summary],
                n_results=1,
                where=filter_isnot,
                collection="mala_persons_info",
            )
        )
        if hit is None:
            return None
        print_yellow(hit[1]["name"], hit[0])
        if hit[0] > max_distance:
            return None

    _, metadata, document = hit
    print_red("NAME", document)
    return FoundPerson(
        db,
        name=metadata["name"],
        key=metadata["_key"],
    )


def _top_hit(query_results):
    """Return ``(distance, metadata, document)`` for the best hit, or None if the query matched nothing."""
    distances = query_results["distances"]
    if not distances or not distances[0]:
        return None
    return (
        distances[0][0],
        query_results["metadatas"][0][0],
        query_results["documents"][0][0],
    )
|
|
|
|
|
def identify(unverified_person: Union[dict, UnverifiedPerson]):
    """
    Finds the best matching known person and asks the LLM, for every
    interrogation mentioning the unverified person, whether the two are the same.

    Args:
        unverified_person (Union[dict, UnverifiedPerson]): The person to
            identify. A dict is wrapped in an UnverifiedPerson.

    Returns:
        dict: A dictionary containing the following keys:
            - "unverified_person": The UnverifiedPerson instance.
            - "found_person": The FoundPerson instance, or None if no match was found.
            - "suggestions": A list of tuples. When a match was found each tuple is
              (LLM answer, interrogation document); otherwise it is (None, interrogation id).
    """
    if not isinstance(unverified_person, UnverifiedPerson):
        unverified_person = UnverifiedPerson(unverified_person)

    # A person document may not list any interrogations.
    interrogation_ids = unverified_person.doc.get("mentioned_in_interrogation") or []

    found_person = find_person(unverified_person)

    if not found_person:
        return {
            "unverified_person": unverified_person,
            "found_person": None,
            "suggestions": [(None, i) for i in interrogation_ids],
        }

    # The prompt below quotes the summary, so make sure there is one.
    if not found_person.summary:
        found_person.make_summary()

    # Only create the LLM once it is known to be needed.
    llm = LLM(small=True)

    suggestions = []
    for interrogation_id in interrogation_ids:
        interrogation_data = db.collection("interrogations").get(interrogation_id)
        if interrogation_data is None:
            # The reference points at an interrogation that cannot be loaded.
            print_red(f"Interrogation {interrogation_id} not found, skipping")
            continue
        text = interrogation_data["text"]

        answer_prompt = (
            f'I texten nedan omnämns en "{unverified_person.name}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n\n'
            "TEXT:\n"
            f'"""{text}"""\n\n'
            "\n"
            f'På andra ställen i polisens förundersökning finns en person som heter "{found_person.name}", och som beskrivs så här:\n'
            f'"""{found_person.summary}"""\n\n'
            f"Verkar det troligt att personen som kallas {unverified_person.name} är samma person som {found_person.name}? Svara bara JA eller NEJ, samt en kort förklaring till varför.\n"
        )
        answer = llm.generate(answer_prompt)
        suggestions.append((answer, interrogation_data))

    return {
        "unverified_person": unverified_person,
        "found_person": found_person,
        "suggestions": suggestions,
    }
|
|
|
|
|
def verify(
    db,
    answer=None,
    unverified_person=None,
    found_person=None,
    interrogation_key=None,
):
    """
    Stores the outcome of a manual check of whether ``unverified_person`` is
    the same person as ``found_person`` in a given interrogation.

    Args:
        db: The database object.
        answer (str): The verdict. Can be "Yes", "No", or "Unknown"; any other
            value only gets printed.
        unverified_person: The person being identified; its ``doc`` is updated.
        found_person: The suggested match; its ``doc`` is updated on "Yes".
        interrogation_key (str): The key identifying the interrogation.

    Returns:
        None
    """
    print_blue("Answer:", answer)

    if answer == "Yes":
        # The mention now belongs to the found person, not the unverified one.
        mentions = unverified_person.doc.setdefault("mentioned_in_interrogation", [])
        if interrogation_key in mentions:
            mentions.remove(interrogation_key)
        db.collection("persons").update(unverified_person.doc)

        found_person.doc["confirmed"] = True

        # Carry the unverified person's info over, skipping entries that are
        # already there so repeated verification does not duplicate them.
        found_info = found_person.doc.setdefault("info", [])
        for item in list(unverified_person.doc.get("info") or []):
            if item not in found_info:
                found_info.append(item)

        found_mentions = found_person.doc.setdefault("mentioned_in_interrogation", [])
        if interrogation_key is not None and interrogation_key not in found_mentions:
            found_mentions.append(interrogation_key)

        print("Updated person in arango:")
        print_green(
            db.collection("persons").insert(found_person.doc, overwrite_mode="update")
        )

        # Once no interrogation refers to the unverified person any more, move
        # its document out of "persons" — unless it is the found person itself.
        if (
            not mentions
            and unverified_person.doc["_key"] != found_person.doc["_key"]
        ):
            db.collection("other_persons").insert(
                unverified_person.doc, overwrite_mode="update"
            )
            db.collection("persons").delete(unverified_person.doc, check_rev=False)
            print_red(f"Removed {unverified_person.doc}")

    elif answer == "No":
        # Remember the rejected name so it can be excluded from later lookups.
        rejected_names = unverified_person.doc.setdefault("is_not", [])
        rejected_name = found_person.doc["name"]
        if rejected_name not in rejected_names:
            rejected_names.append(rejected_name)
        db.collection("persons").update(
            unverified_person.doc, merge=True, check_rev=False
        )

    elif answer == "Unknown":
        db.collection("unknown").insert(
            {"name": unverified_person.name, "interrogation": interrogation_key},
            overwrite=True,
        )
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: look up every person document flagged with
    # `other == true` and print the match found for it.
    q = "for doc in persons filter doc.other == true return doc"
    other_persons = list(db.aql.execute(q))

    for person in other_persons:
        print(find_person(person))

    # NOTE(review): the original indentation was lost; exit() is assumed to
    # follow the loop rather than sit inside it — confirm.
    exit()

    # Parallel alternative, currently disabled:
    # with multiprocessing.Pool() as pool:
    #     pool.map(find_person, other_persons)
|
|
|