You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

327 lines
11 KiB

from _chroma import chroma
from _arango import arango, db
from _llm import LLM
from print_color import *
import multiprocessing
from typing import Union
import difflib
class Person:
    """
    Base class for a person whose free-text information can be summarized.

    Attributes:
        info (str | None): Text describing the person; None until a subclass
            or caller sets it.
        summary (str | None): Result of the latest make_summary() call; None
            until a summary has been generated.
    """

    def __init__(self):
        self.info = None
        self.summary = None

    def make_summary(self):
        """
        Summarize ``self.info`` with the small, non-chat LLM.

        The result is stored in ``self.summary`` and also returned, so that
        both ``obj.make_summary()`` and ``x = obj.make_summary()`` work.

        Returns:
            The value produced by ``llm.generate`` (presumably a str), or
            None when there is no info to summarize.
        """
        # Nothing to summarize: avoid len(None) and a pointless LLM call.
        if not self.info:
            self.summary = None
            return None

        llm = LLM(chat=False, small=True)
        # Info longer than 100 characters gets the prompt worded for several
        # pieces of information; shorter info gets the single-piece wording.
        if len(self.info) > 100:
            summary_prompt = f"""Nedan är olika bitar med information om en person:\n
{self.info}\n\nSammanfatta dessa på ett detaljerat sätt. Var noga med namn, platser, händelser och relationer.
Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. """
        else:
            summary_prompt = f"""Nedan är information om en person:\n
{self.info}\n\nSammanfatta denna information så detaljerat som möjligt. Var noga med namn, platser, händelser och relationer.
Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. """
        self.summary = llm.generate(summary_prompt)
        return self.summary
class UnverifiedPerson(Person):
    """
    A person described by a raw document that has not yet been matched.

    Every key/value pair of the document is mirrored as an instance
    attribute. After that, ``info`` and ``name`` are set explicitly:
    ``info`` becomes the newline-joined ``doc["info"]`` (or None when the
    document has no such key) and ``name`` becomes ``doc["name"]`` (or an
    empty string when missing).

    Args:
        doc (dict): The person document.
        interrogation (str): Accepted but not used by this constructor.
    """

    def __init__(self, doc: dict, interrogation: str = None):
        super().__init__()
        self.doc = doc

        # Expose each document field directly on the instance.
        for field, value in self.doc.items():
            setattr(self, field, value)

        # These two assignments run after the mirroring loop on purpose, so
        # they override whatever the loop stored for "info" and "name".
        self.info = "\n".join(doc["info"]) if 'info' in doc else None
        self.name = doc["name"] if 'name' in doc else ''
class FoundPerson(Person):
    """
    Represents a person found in ArangoDB.

    Attributes:
        name (str): The name of the person.
        info (str): The entries of the document's "info" field joined by
            newlines (empty string when the field is missing or empty).
        key (str): The document key used to fetch the person.
        doc (dict): The person's document from the "persons" collection.
        summary: Summary generated from ``info`` by make_summary().
    """

    def __init__(self, db, name, key):
        """
        Load the person document and generate its summary.

        Args:
            db: Database handle exposing ``collection(name).get(key)``.
            name (str): The name of the person.
            key (str): Key of the document in the "persons" collection.

        Raises:
            LookupError: If no document is returned for ``key``.
        """
        super().__init__()
        self.name = name
        self.key = key
        self.doc = db.collection("persons").get(key)
        if self.doc is None:
            # Fail with a clear message instead of an opaque
            # "'NoneType' object is not subscriptable".
            raise LookupError(
                f"No document with key {key!r} in collection 'persons'"
            )
        self.info = "\n".join(self.doc.get("info") or [])
        # make_summary() stores its result on self.summary itself. Its return
        # value must not be assigned here: a make_summary() that returns
        # nothing would overwrite the fresh summary with None.
        self.make_summary()
def check_name(person, answer_person, text):
    """
    Decide whether ``person`` and ``answer_person`` refer to the same name.

    If ``person`` is a single word it is compared separately with the first
    and the last word of ``answer_person``:

    * close to the first name (ratio > 0.9): accepted when the last name
      occurs in ``text``, when exactly one stored person has that first
      name, or when the LLM says the last name is mentioned in ``text``;
    * close to the last name (ratio > 0.9): accepted when the first name
      occurs in ``text`` or the LLM says it is mentioned.

    Otherwise the two full strings are compared and accepted when their
    similarity ratio exceeds 0.85.

    Args:
        person (str): The name to check.
        answer_person (str): The candidate name to compare against.
        text (str): Text searched for the other part of the candidate name.

    Returns:
        bool: True when the names are judged to be the same.
    """
    print_yellow(person, " - ", answer_person)
    person_parts = person.split()
    print('Length person:', len(person_parts))

    if len(person_parts) != 1:
        name_similarity = difflib.SequenceMatcher(None, person, answer_person).ratio()
        print("Similarity:", name_similarity)
        return name_similarity > 0.85

    answer_parts = answer_person.split()
    if not answer_parts:
        # An empty candidate has no first/last name to compare with.
        return False

    # Use the whitespace-free token so surrounding spaces do not lower the ratio.
    single_name = person_parts[0]
    answer_first_name = answer_parts[0]
    answer_last_name = answer_parts[-1]

    first_name_similarity = difflib.SequenceMatcher(
        None, single_name, answer_first_name
    ).ratio()
    last_name_similarity = difflib.SequenceMatcher(
        None, single_name, answer_last_name
    ).ratio()
    print("First name similarity:", first_name_similarity)
    print("Last name similarity:", last_name_similarity)

    if first_name_similarity > 0.9:
        if answer_last_name in text:
            return True
        # Count the stored persons sharing this first name; a unique first
        # name is accepted without consulting the LLM. Documents without a
        # usable name are ignored instead of raising.
        first_name_count = sum(
            1
            for doc in db.collection("persons").all()
            if (doc.get("name") or "").split()[:1] == [answer_first_name]
        )
        print("First name count:", first_name_count)
        if first_name_count == 1:
            return True
        return _llm_confirms_name("efternamnet", answer_last_name, text)

    if last_name_similarity > 0.9:
        if answer_first_name in text:
            return True
        return _llm_confirms_name("förnamnet", answer_first_name, text)

    return False


def _llm_confirms_name(kind, name, text):
    """Ask the small LLM whether ``name`` (labelled by ``kind``) is mentioned in the first 5000 characters of ``text``."""
    llm = LLM(small=True)
    answer = llm.generate(
        f'Nämns någon med {kind} "{name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
    )
    return "JA" in answer
def find_with_llm(unverified_person: UnverifiedPerson):
    """
    Generate the summary of ``unverified_person``.

    Currently this only triggers ``make_summary()`` on the given person;
    nothing is looked up and nothing is returned.

    Args:
        unverified_person (UnverifiedPerson): The person to summarize.
    """
    unverified_person.make_summary()
def find_person(
    unverified_person: Union[dict, UnverifiedPerson, str] = None,
    name: str = None,
    key: str = None,
):
    """
    Find the closest known person for an unverified person via Chroma.

    The "mala_persons" collection is queried with the person's name. If the
    best hit has a distance above 1, the person's info is summarized and the
    "mala_persons_info" collection is queried with that summary instead. The
    person's own name and every name listed under "is_not" in the document
    are excluded from both queries.

    Args:
        unverified_person (Union[dict, UnverifiedPerson, str]): The person to
            identify: an UnverifiedPerson, a person document, or just a name.
        name (str): Optional name; overrides the "name" of a dict/str input.
        key (str): Optional key; stored as "_key" for a dict/str input.

    Returns:
        FoundPerson: The matched person loaded from ArangoDB, or None when no
        hit with a distance of at most 1 is found.
    """
    if not isinstance(unverified_person, UnverifiedPerson):
        if isinstance(unverified_person, str):
            person_doc = {"name": unverified_person}
        else:
            # Work on a copy so the caller's dict is not modified.
            person_doc = dict(unverified_person or {})
        if name:
            person_doc['name'] = name
        if key:
            person_doc['_key'] = key
        unverified_person = UnverifiedPerson(person_doc)

    filter_isnot = {"name": {"$nin": _excluded_names(unverified_person)}}

    query_results = chroma.query(
        query_texts=[unverified_person.name],
        n_results=1,
        where=filter_isnot,
        collection="mala_persons",
    )
    distance = _best_distance(query_results)
    if distance is not None:
        print_purple(query_results["metadatas"][0][0]["name"], distance)

    if distance is None or distance > 1:
        # The name alone was not close enough; fall back to a summary-based
        # search, which is only possible when there is info to summarize.
        if not unverified_person.info:
            return None
        unverified_person.make_summary()
        if not unverified_person.summary:
            return None
        query_results = chroma.query(
            query_texts=[unverified_person.summary],
            n_results=1,
            where=filter_isnot,
            collection="mala_persons_info",
        )
        distance = _best_distance(query_results)
        if distance is None:
            return None
        print_yellow(query_results["metadatas"][0][0]["name"], distance)
        if distance > 1:
            return None

    print_red("NAME", query_results["documents"][0][0])
    match = query_results["metadatas"][0][0]
    return FoundPerson(db, name=match["name"], key=match["_key"])


def _excluded_names(unverified_person):
    """Return a flat list with the person's own name plus all names stored under "is_not"."""
    excluded = [unverified_person.name]
    rejected = unverified_person.doc.get("is_not") or []
    if isinstance(rejected, str):
        rejected = [rejected]
    for entry in rejected:
        # Stored entries may be a single name or a list of names.
        if isinstance(entry, str):
            excluded.append(entry)
        else:
            excluded.extend(entry)
    return excluded


def _best_distance(query_results):
    """Return the distance of the first hit, or None when the query produced no hit."""
    try:
        return query_results["distances"][0][0]
    except (IndexError, KeyError, TypeError):
        return None
def identify(unverified_person: Union[dict, UnverifiedPerson]):
    """
    Find a matching person and ask the LLM, per interrogation, whether the match is plausible.

    Args:
        unverified_person (Union[dict, UnverifiedPerson]): The person (or
            person document) to identify. The interrogations to check are
            read from its "mentioned_in_interrogation" field.

    Returns:
        dict: A dictionary containing the following keys:
            - "unverified_person": The UnverifiedPerson instance.
            - "found_person": The FoundPerson instance, or None when no
              match was found.
            - "suggestions": A list of tuples. When no match was found each
              tuple is (None, interrogation id); otherwise each tuple is
              (LLM answer, interrogation document).
    """
    if not isinstance(unverified_person, UnverifiedPerson):
        unverified_person = UnverifiedPerson(unverified_person)

    # A document without the field is treated as "mentioned nowhere".
    interrogation_ids = unverified_person.doc.get("mentioned_in_interrogation") or []

    found_person = find_person(unverified_person)
    if not found_person:
        return {
            "unverified_person": unverified_person,
            "found_person": None,
            "suggestions": [
                (None, interrogation_id) for interrogation_id in interrogation_ids
            ],
        }

    # The prompt below embeds the summary; generate it if it is still unset
    # so that the literal text "None" never ends up in the prompt.
    if found_person.summary is None:
        found_person.make_summary()

    # Only build the LLM once it is known to be needed.
    llm = LLM(small=True)
    suggestions = []
    for interrogation_id in interrogation_ids:
        # NOTE(review): presumably get() yields None for an unknown id, in
        # which case the next line raises; confirm such ids cannot occur.
        interrogation_data = db.collection("interrogations").get(interrogation_id)
        text = interrogation_data["text"]
        answer_prompt = f'''I texten nedan omnämns en "{unverified_person.name}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n
TEXT:
"""{text}"""\n
På andra ställen i polisens förundersökning finns en person som heter "{found_person.name}", och som beskrivs så här:
"""{found_person.summary}"""\n
Verkar det troligt att personen som kallas {unverified_person.name} är samma person som {found_person.name}? Svara bara JA eller NEJ, samt en kort förklaring till varför.
'''
        answer = llm.generate(answer_prompt)
        suggestions.append((answer, interrogation_data))

    return {
        "unverified_person": unverified_person,
        "found_person": found_person,
        "suggestions": suggestions,
    }
def verify(
    db,
    answer=None,
    unverified_person=None,
    found_person=None,
    interrogation_key=None,
):
    """
    Store the outcome of a manual check of a suggested identification.

    * "Yes": the interrogation is removed from the unverified person's
      "mentioned_in_interrogation" list, the found person is marked as
      confirmed and receives the unverified person's info and the
      interrogation key. If the unverified person has no interrogations left
      and is a different document than the found person, it is moved from
      "persons" to "other_persons".
    * "No": the found person's name is added to the unverified person's
      "is_not" list.
    * "Unknown": name and interrogation key are stored in "unknown".

    Any other answer only prints the answer.

    Args:
        db: The database object.
        answer (str): "Yes", "No", or "Unknown".
        unverified_person: Object with ``doc`` (the person document) and
            ``name`` attributes, e.g. an UnverifiedPerson.
        found_person: Object with a ``doc`` attribute, e.g. a FoundPerson.
        interrogation_key (str): The key identifying the interrogation.

    Returns:
        None
    """
    print_blue("Answer:", answer)

    if answer == "Yes":
        unverified_doc = unverified_person.doc
        found_doc = found_person.doc

        remaining = unverified_doc.setdefault("mentioned_in_interrogation", [])
        # list.remove() raises when the key is absent, e.g. on a repeated call.
        if interrogation_key in remaining:
            remaining.remove(interrogation_key)
        db.collection("persons").update(unverified_doc)

        found_doc["confirmed"] = True
        # Merge the unverified person's info into the found person. Skipping
        # pieces already present keeps repeated confirmations from
        # duplicating entries. (Assumes "info" is a list of strings.)
        found_info = found_doc.setdefault("info", [])
        found_info.extend(
            [piece for piece in unverified_doc.get("info") or [] if piece not in found_info]
        )
        # Record the confirmed interrogation on the found person.
        found_mentions = found_doc.setdefault("mentioned_in_interrogation", [])
        if interrogation_key is not None and interrogation_key not in found_mentions:
            found_mentions.append(interrogation_key)

        print("Updated person in arango:")
        print_green(
            db.collection("persons").insert(found_doc, overwrite_mode="update")
        )

        if not remaining and unverified_doc["_key"] != found_doc["_key"]:
            # Nothing refers to the unverified person any more: archive it.
            db.collection("other_persons").insert(
                unverified_doc, overwrite_mode="update"
            )
            db.collection("persons").delete(unverified_doc, check_rev=False)
            print_red(f"Removed {unverified_doc}")

    elif answer == "No":
        # Store the plain name (not a one-element list) so "is_not" stays a
        # flat list of names.
        rejected = unverified_person.doc.setdefault("is_not", [])
        rejected_name = found_person.doc["name"]
        if rejected_name not in rejected:
            rejected.append(rejected_name)
        db.collection("persons").update(
            unverified_person.doc, merge=True, check_rev=False
        )

    elif answer == "Unknown":
        db.collection("unknown").insert(
            {"name": unverified_person.name, "interrogation": interrogation_key},
            overwrite=True,
        )
if __name__ == "__main__":
    # Ad-hoc driver: run find_person() on every person document flagged
    # with `other == true` and print the result.
    # NOTE(review): `persons` is not used anywhere below in this block.
    persons = list(db.collection("persons").all())
    q = "for doc in persons filter doc.other == true return doc"
    other_persons = [i for i in db.aql.execute(q)]
    for person in other_persons:
        print(find_person(person))
    # NOTE(review): the original indentation of this call could not be
    # recovered; it is placed after the loop here. Confirm it was not meant
    # to sit inside the loop (stopping after the first person).
    exit()
    # Parallel variant, currently disabled:
    # with multiprocessing.Pool() as pool:
    # pool.map(find_person, other_persons)