You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
423 lines
16 KiB
423 lines
16 KiB
from _chroma import chroma |
|
from _arango import arango, db |
|
from _llm. import LLM |
|
from print_color import * |
|
import difflib |
|
import re |
|
from langchain_text_splitters import CharacterTextSplitter |
|
|
|
# text_splitter = CharacterTextSplitter( |
|
# separator="\n\n", |
|
# chunk_size=8000, |
|
# chunk_overlap=0, |
|
# length_function=len, |
|
# is_separator_regex=False, |
|
# ) |
|
|
|
|
|
class Person:
    """Base class for a person mentioned in the investigation material.

    Attributes:
        info (str | None): Newline-joined information snippets about the person.
        summary (str | None): LLM-written summary of ``info``; set by
            ``make_summary``.
        name (str): Name the person goes by; subclasses overwrite it.
        doc (dict): The person's database document; subclasses overwrite it.
    """

    def __init__(self):
        self.info = None
        self.summary = None
        # Defaults for the attributes make_summary() reads, so the base class
        # is usable before a subclass has filled them in.
        self.name = ""
        self.doc = {}

    def make_summary(self):
        """Summarise what is known about the person into ``self.summary``.

        When ``self.info`` is missing or shorter than 200 characters and the
        document carries both ``interrogation_key`` and ``name``, the
        interrogation text is fetched and an LLM extracts what it says about
        the person; that extract is appended to any existing info. The
        combined info is then condensed by a second LLM call.

        Raises:
            ValueError: If there is no info and none can be extracted from an
                interrogation, so there is nothing to summarise.
        """
        doc = getattr(self, "doc", None) or {}
        info = self.info
        can_fetch = "interrogation_key" in doc and "name" in doc
        needs_more = not info or len(info) < 200

        # Fail early (before building an LLM client) when nothing can be done.
        if not info and not can_fetch:
            raise ValueError(
                "Nothing to summarise: the person has no info and the document "
                "lacks 'interrogation_key' and/or 'name'."
            )

        llm = LLM(chat=False, system_prompt="Du sammanfattar information om en person utifrån ett polisförhör. Sammanfattningen ska sedan användas för att göra en sökning i en vektordatabas.")

        if needs_more and can_fetch:
            interrogation_text = self._interrogation_excerpt(doc)
            if interrogation_text:
                mentioned_as = doc["name"]
                prompt = (
                    "Nedan är ett polisförhör:\n\n"
                    f"{interrogation_text}\n\n"
                    f'Jag är intresserad av en person som omnämns som "{mentioned_as}". '
                    f"Gör en detaljerad sammanfattning av informationen om {self.name}. "
                    "Var noga med relationer, namn och platser. "
                    "Svara ENBART med informationen om personen, ingenting annat. "
                    "Svara alltid på svenska!"
                )
                extracted = llm.generate(prompt)
                # Keep whatever was already known and add the new extract.
                info = f"{self.info}\n{extracted}" if self.info else extracted
                print_rainbow(f'Info about: {self.name}', info)

        if not info:
            raise ValueError(
                "Nothing to summarise: no info could be extracted from the interrogation."
            )

        # The original literal opened with four quotes, which put a stray '"'
        # at the very start of the prompt; it is removed here.
        summary_prompt = (
            "Nedan är olika bitar med information om en person:\n\n"
            f"{info}\n\n"
            "Sammanfatta dessa på ett detaljerat sätt. Var noga med namn, platser, händelser och relationer.\n"
            "Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat."
        )
        self.summary = llm.generate(summary_prompt)

    def _interrogation_excerpt(self, doc):
        """Return the interrogation text for ``doc``, or None if unavailable.

        Texts longer than 20000 characters that contain the person's name are
        cut to an 8000-character window starting up to 1000 characters before
        the first mention. Long texts that never mention the name are returned
        unchanged.
        """
        record = db.collection("interrogations").get(doc["interrogation_key"])
        if not record or not record.get("text"):
            return None

        text = record["text"]
        if len(text) > 20000 and doc["name"] in text:
            start = max(text.find(doc["name"]) - 1000, 0)
            text = text[start:start + 8000]
        return text
|
|
|
|
|
class UnknownPerson(Person):
    """A person who is mentioned somewhere but not yet identified.

    Every field of the supplied document is mirrored as an instance
    attribute. ``info`` and ``name`` are then normalised: ``info`` becomes the
    document's info entries joined by newlines (None when the document has no
    ``info`` field) and ``name`` falls back to an empty string.
    """

    def __init__(self, doc: dict):
        super().__init__()
        self.doc: dict = doc

        # Expose the document's fields directly on the instance.
        for field, value in self.doc.items():
            setattr(self, field, value)

        self.info = "\n".join(doc["info"]) if "info" in doc else None
        self.name = doc.get("name", "")
|
|
|
|
|
class FoundPerson(Person):
    """Represents a person found in ArangoDB.

    Attributes:
        name (str): The name of the person.
        info (str | None): The document's info entries joined by newlines, or
            None when the document has no info.
        key (str): A unique identifier for the person.
        doc (dict): The person's document in ArangoDB.
        summary (str | None): A summary of the person's details.
    """

    def __init__(self, db, name, key):
        """Load the person's document from the ``persons`` collection.

        Args:
            db: Database handle exposing ``collection(name).get(key)``.
            name (str): The name of the person.
            key (str): Key of the person's document.

        Raises:
            LookupError: If no document with ``key`` exists.
        """
        super().__init__()
        self.name = name
        self.key = key
        self.doc = db.collection("persons").get(key)
        if self.doc is None:
            # Without this check the failure would be an opaque
            # "'NoneType' object is not subscriptable" further down.
            raise LookupError(
                f"No document with key {key!r} in the 'persons' collection."
            )

        entries = self.doc.get("info")
        if not entries:
            self.info = None
        elif isinstance(entries, str):
            # Joining a plain string would put a newline between every character.
            self.info = entries
        else:
            self.info = "\n".join(entries)
|
|
|
|
|
class PersonIdentifier:
    """Works out which known person an unidentified mention refers to.

    Attributes:
        doc (dict | None): Document describing the unidentified person.
        name (str | None): Name of the unidentified person.
        key (str | None): Document key of the unidentified person.
        unknown_person: The person to identify (an ``UnknownPerson`` unless a
            ready-made person object was passed in).
        found_person: Candidate match, set by ``identify``.
        suggestions (list | None): ``(llm_answer, interrogation)`` tuples
            produced by ``identify``.
        interrogation_key (str | None): Key of the interrogation in question.
        text (str | None): Text in which the person is mentioned.
    """

    def __init__(
        self,
        doc: dict = None,
        name: str = None,
        key: str = None,
        person: "UnknownPerson" = None,
        interrogation_key: str = None,
        text: str = None,
    ):
        """Create an identifier from a document, a name/key, or a person object.

        Values in ``doc`` take precedence over the ``name`` and ``key``
        arguments. ``doc`` is optional; the original code evaluated
        ``'name' in doc`` unconditionally and failed when it was omitted.
        """
        self.doc: dict = doc
        self.name: str = doc["name"] if doc and "name" in doc else name
        self.key: str = doc["_key"] if doc and "_key" in doc else key
        self.unknown_person = None
        self.found_person = None
        self.suggestions = None
        self.interrogation_key = interrogation_key
        self.text = text

        self.get_unknown_person(doc, name, key, person)

        # find_with_llm() puts self.name in a prompt; fall back to the person's
        # own name when none was given explicitly.
        if self.name is None:
            self.name = getattr(self.unknown_person, "name", None)

    def get_unknown_person(self, doc, name, key, person):
        """Set ``self.unknown_person`` from the first usable source.

        Order of precedence: an explicit ``person`` object, a ``doc``, the
        stored document for ``key``, and finally a minimal document built
        from ``name``/``key``. ``self.found_person`` is reset to None.

        Raises:
            ValueError: If none of the sources is available.
        """
        self.unknown_person = None
        self.found_person = None

        if person:
            self.unknown_person = person
            return
        if doc:
            self.unknown_person = UnknownPerson(doc)
            return

        # Look the key up once and reuse the result.
        stored = db.collection("persons").get(key) if key else None
        if stored:
            self.unknown_person = UnknownPerson(stored)
            return

        if not (key or name):
            raise ValueError("Both key and name are missing.")
        self.unknown_person = UnknownPerson(
            {k: v for k, v in (("name", name), ("_key", key)) if v}
        )

    @staticmethod
    def _name_ratio(left, right):
        """Return the difflib similarity ratio (0..1) between two names."""
        return difflib.SequenceMatcher(None, left, right).ratio()

    @staticmethod
    def _first_name_is_unique(first_name):
        """Return True if exactly one stored person has ``first_name`` as first name."""
        occurrences = 0
        for person_doc in db.collection("persons").all():
            parts = (person_doc.get("name") or "").split()
            if parts and parts[0] == first_name:
                occurrences += 1
        return occurrences == 1

    @staticmethod
    def _llm_confirms_name(name_kind, name, text):
        """Ask the small LLM whether ``name`` occurs in the first 5000 characters of ``text``.

        ``name_kind`` is the Swedish noun used in the question
        ("efternamnet" or "förnamnet"). Returns True when the answer
        contains "JA".
        """
        question = (
            f'Nämns någon med {name_kind} "{name}" i texten nedan?\n\n'
            f'"""{text[:5000]}"""\n\n'
            'Namnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
        )
        return "JA" in LLM(small=True).generate(question)

    def check_name(self, text):
        """Check if it's likely that the unknown and the found person are the same.

        Args:
            text (str): Text in which the unknown person is mentioned.

        Returns:
            bool: True when the names are judged to refer to the same person.
        """
        unknown_name = self.unknown_person.name
        found_name = self.found_person.name
        print_yellow(unknown_name, " - ", found_name)
        text = text or ""

        found_parts = found_name.split()
        if len(unknown_name.strip().split()) != 1 or not found_parts:
            # Not a single-word name: compare the full names directly.
            return self._name_ratio(unknown_name, found_name) > 0.85

        # The unknown person has a single name (first or last); compare it to
        # the first and last name of the found person.
        answer_first_name = found_parts[0]
        answer_last_name = found_parts[-1]

        if self._name_ratio(unknown_name, answer_first_name) > 0.9:
            if answer_last_name in text:
                return True
            # A first name carried by only one stored person is taken as a match.
            if self._first_name_is_unique(answer_first_name):
                return True
            return self._llm_confirms_name("efternamnet", answer_last_name, text)

        if self._name_ratio(unknown_name, answer_last_name) > 0.9:
            if answer_first_name in text:
                return True
            return self._llm_confirms_name("förnamnet", answer_first_name, text)

        return False

    def find_with_llm(self):
        """Try to find the unknown person's full name with vector search and an LLM.

        The person's summary is split into a few search sentences, the
        ``mala_interrogations`` collection is queried with them, and the LLM
        is asked to name the person from the retrieved passages.

        Returns:
            The LLM's answer (expected "Förnamn Efternamn" or "None").
        """
        if not self.unknown_person.summary:
            self.unknown_person.make_summary()
        summary = self.unknown_person.summary
        unknown_name = self.unknown_person.name

        llm = LLM(chat=True, system_prompt="Du hjälper till att ta reda på vad en person heter. Först skapar du meningar som ska användas för att söka i en vektordatabas, sedan använder du informationen du får där till att ta reda på vad personen heter. Svara alltid på svenska.")
        print_rainbow('Info bites:', summary)

        # The original literal contained the invalid escape "\n\D"; a blank
        # line ("\n\n") before "Dela" is what was meant.
        info_bites = llm.generate(
            "Nedan är olika bitar med information om en person:\n\n"
            f" {summary} \n\n"
            "Dela upp den i 3-4 meningar där varje mening beskriver en specifik detalj om personen. "
            "Svara med en mening per rad. "
            "Svara ENBART med informationen om personen, ingenting annat."
        )

        # Drop blank lines so no empty query is sent; fall back to the summary.
        querys = [line.strip() for line in info_bites.split("\n") if line.strip()]
        if not querys:
            querys = [summary]
        print_rainbow('Querys:', querys)

        chroma_docs = chroma.query(
            query_texts=querys,
            n_results=3,
            collection="mala_interrogations",
        )

        passages = []
        for answer_docs in chroma_docs['documents']:
            for passage in answer_docs:
                print_blue(passage)
                passages.append(passage + "\n")
        info = "".join(passages)

        # Only quote the source text when one was supplied; otherwise the
        # prompt would contain the literal word "None".
        intro = (
            f"Nedan är en text där {self.name} nämns:\n\n{self.text}\n\n"
            if self.text
            else ""
        )
        prompt = (
            f"{intro}"
            f'Jag vill veta vem "{unknown_name}" är. Läs texten nedan för att se om du kan hitta personens fulla namn:\n\n'
            f"{info}\n\n"
            f'Vad heter "{unknown_name}"? Svara med förnamn och efternamn på formen "Förnamn Efternamn". Svara "None" om det inte går att säga utifrån informationen.'
        )
        print_yellow('Längd på info:', len(info))
        print_rainbow('Prompt', prompt)
        answer = llm.generate(prompt)
        print_green(answer)
        return answer

    @staticmethod
    def _best_distance(query_results):
        """Return the distance of the top hit, or None when there is no hit."""
        try:
            return query_results["distances"][0][0]
        except (IndexError, KeyError, TypeError):
            return None

    def find_person(self):
        """Finds a person in the Chroma db.

        Searches ``mala_persons`` by name first. If the best hit is farther
        away than 1, the person is summarised and ``mala_persons_info`` is
        searched with the summary instead.

        Returns:
            FoundPerson | None: The best candidate, or None if nothing is
            close enough.
        """
        unknown_doc = self.unknown_person.doc or {}

        # Names never to be suggested: the person's own name plus everyone
        # already rejected. (The original used the return value of
        # list.append(), which is None, as the filter list.)
        list_filter_isnot = [self.unknown_person.name]
        for rejected in unknown_doc.get("is_not", []):
            # Entries may be nested lists in older documents; flatten them.
            if isinstance(rejected, (list, tuple)):
                list_filter_isnot.extend(rejected)
            else:
                list_filter_isnot.append(rejected)

        filter_isnot = {"name": {"$nin": list_filter_isnot}}

        query_results = chroma.query(
            query_texts=[self.unknown_person.name],
            n_results=1,
            where=filter_isnot,
            collection="mala_persons",
        )
        distance = self._best_distance(query_results)
        if distance is not None:
            print_purple(query_results["metadatas"][0][0]["name"], distance)

        if distance is None or distance > 1:  #! This is not really working...
            self.unknown_person.make_summary()
            query_results = chroma.query(
                query_texts=[self.unknown_person.summary],
                n_results=1,
                where=filter_isnot,
                collection="mala_persons_info",
            )
            distance = self._best_distance(query_results)
            if distance is None:
                return None
            print_yellow(query_results["metadatas"][0][0]["name"], distance)
            if distance > 1:
                return None

        print_blue("Name found peson:", query_results["documents"][0][0])
        return FoundPerson(
            db,
            name=query_results["metadatas"][0][0]["name"],
            key=query_results["metadatas"][0][0]["_key"],
        )

    def identify(self):
        """Find a candidate and ask the LLM, per interrogation, if it is the same person.

        Sets ``self.found_person`` and ``self.suggestions``. Each suggestion
        is an ``(llm_answer, interrogation_document)`` tuple. When no
        candidate is found, or an interrogation document is missing, the
        entry is ``(None, interrogation_id)``.
        """
        # find_person() takes no arguments; the original passed one.
        self.found_person = self.find_person()

        unknown_doc = self.unknown_person.doc or {}
        interrogation_ids = unknown_doc.get("mentioned_in_interrogation", [])

        if not self.found_person:
            self.suggestions = [(None, i) for i in interrogation_ids]
            # Nothing to compare against, so stop here.
            return

        # Summarize the found person's info
        self.found_person.make_summary()

        llm = LLM(small=True)
        unknown_name = self.unknown_person.name
        found_name = self.found_person.name

        suggestions = []
        for interrogation_id in interrogation_ids:
            interrogation_data = db.collection("interrogations").get(interrogation_id)
            if not interrogation_data:
                suggestions.append((None, interrogation_id))
                continue
            text = interrogation_data["text"]

            answer_prompt = (
                f'I texten nedan omnämns en "{unknown_name}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n\n'
                "TEXT:\n"
                f'"""{text}"""\n\n\n'
                f'På andra ställen i polisens förundersökning finns en person som heter "{found_name}", och som beskrivs så här:\n'
                f'"""{self.found_person.summary}"""\n\n'
                f"Verkar det troligt att personen som kallas {unknown_name} är samma person som {found_name}? Svara bara JA eller NEJ, samt en kort förklaring till varför.\n"
            )
            answer = llm.generate(answer_prompt)
            suggestions.append((answer, interrogation_data))

        self.suggestions = suggestions

    @staticmethod
    def verify(
        db,
        answer=None,
        unknown_person=None,
        found_person=None,
        interrogation_key=None,
    ):
        """
        Stores a reviewer's verdict on a suggested identification.

        Declared as a static method because the signature has no ``self``;
        both ``PersonIdentifier.verify(db, ...)`` and
        ``instance.verify(db, ...)`` work.

        Args:
            db: The database object.
            answer (str): The verdict. Can be "Yes", "No", or "Unknown".
            unknown_person: The person being identified (has ``doc`` and ``name``).
            found_person: The suggested match (has ``doc``).
            interrogation_key (str): The key identifying the interrogation.

        Returns:
            None
        """
        print_blue("Answer:", answer)

        if answer == "Yes":
            # The mention is resolved; take it off the unknown person.
            mentions = unknown_person.doc.setdefault("mentioned_in_interrogation", [])
            if interrogation_key in mentions:
                mentions.remove(interrogation_key)
            db.collection("persons").update(unknown_person.doc)

            found_person.doc["confirmed"] = True

            # Carry over the unknown person's info. The original appended the
            # found person's own info to itself.
            found_info = found_person.doc.setdefault("info", [])
            for item in unknown_person.doc.get("info", []):
                if item not in found_info:
                    found_info.append(item)

            # Record the interrogation itself. The original appended the
            # literal string "mentioned_in_interrogation".
            found_mentions = found_person.doc.setdefault(
                "mentioned_in_interrogation", []
            )
            if interrogation_key is not None and interrogation_key not in found_mentions:
                found_mentions.append(interrogation_key)

            print("Updated person in arango:")
            print_green(
                db.collection("persons").insert(
                    found_person.doc, overwrite_mode="update"
                )
            )

            # With no mentions left, the unknown entry is moved out of
            # "persons" unless it is the very same document as the match.
            if (
                not mentions
                and unknown_person.doc["_key"] != found_person.doc["_key"]
            ):
                db.collection("other_persons").insert(
                    unknown_person.doc, overwrite_mode="update"
                )
                db.collection("persons").delete(unknown_person.doc, check_rev=False)
                print_red(f"Removed {unknown_person.doc}")

        elif answer == "No":
            # Remember the rejected name so it is not suggested again. Names
            # are stored as plain strings so the "$nin" filter can use them.
            rejected = unknown_person.doc.setdefault("is_not", [])
            rejected_name = found_person.doc["name"]
            if rejected_name not in rejected:
                rejected.append(rejected_name)
            db.collection("persons").update(
                unknown_person.doc, merge=True, check_rev=False
            )

        elif answer == "Unknown":
            db.collection("unknown").insert(
                {"name": unknown_person.name, "interrogation": interrogation_key},
                overwrite=True,
            )
|
|
|
|
|
class PersonFinder:
    """Extracts the names of persons mentioned in interrogation text.

    Attributes:
        names (dict): Names already known, keyed by name. Shorter variants
            found later are added as aliases pointing at the same value.
        llm: Small LLM used for the extraction.
        text_splitter: Splitter configured with the given separator, chunk
            size and overlap.
    """

    def __init__(
        self,
        names=None,
        chunk_size=5000,
        chunk_overlap=0,
        separator="\n\n",
    ):
        """Set up the LLM and the text splitter.

        Args:
            names (dict | None): Already known names. A fresh dict is created
                when omitted, so instances never share a default.
            chunk_size (int): Maximum chunk length handed to the splitter.
            chunk_overlap (int): Overlap between consecutive chunks.
            separator (str): Separator the splitter breaks the text on.
        """
        self.names = names if names is not None else {}
        self.llm = LLM(
            chat=False,
            small=True,
            system_prompt="Du är en assistent som hjälper till att hitta personer i ett polisförhör. Svara bara när personen finns i den del du får, hitta inte på personer.",
        )
        self.text_splitter = CharacterTextSplitter(
            # The original ignored the parameter and hard-coded "\n\n".
            separator=separator,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )

    def extract_names(self, chunk, extra_prompt=""):
        """Ask the LLM which persons are mentioned in ``chunk``.

        Args:
            chunk (str): The piece of text to search.
            extra_prompt (str): Extra instructions appended to the prompt.

        Returns:
            list[str]: Names not seen before. A name that is a substring of
            an already known name is not returned; it is registered in
            ``self.names`` as an alias of that known name instead.
        """
        # NOTE(review): "Var noga med att svara" reads as an unfinished
        # sentence that extra_prompt is presumably meant to complete - confirm.
        prompt = (
            "Jag vill hitta alla personer som nämns i texten nedan:\n\n"
            f'"""{chunk}"""\n\n'
            "Vilka personer nämns i texten? Svara ENBART med en pythonformaterad lista av namn.\n"
            "Exempel på svar för att du ska förstå formen:\n"
            "<exempel>\n"
            "[namn1, namn2, namn3].\n"
            "</exempel>\n"
            "Var noga med att svara\n"
            f"{extra_prompt}"
        )
        response = self.llm.generate(prompt)
        # Keep only letters, hyphens, spaces and commas (drops brackets, quotes, digits).
        response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")

        chunk_names = []
        for piece in response.split(","):
            name = piece.strip()
            # Measure the stripped name: the original measured the raw piece,
            # so whitespace-only pieces became "" (a substring of every known
            # name). Two-letter names such as "Bo" are still accepted.
            if len(name) < 2 or name in chunk_names or name in self.names:
                continue

            # A shorter form of a known name becomes an alias; if several
            # known names match, the last one wins.
            matches = [known for known in self.names if name in known]
            if matches:
                self.names[name] = self.names[matches[-1]]
            else:
                chunk_names.append(name)

        return chunk_names
|
|
|
|
|
if __name__ == "__main__":
    # Ad-hoc manual run: fetch one rumor document, then try to resolve the
    # first-name-only mention "Douglas" from a single interrogation.
    rumor_doc = db.collection("rumors").get("Mikael_Sjostrom_2023-02-13_p.98")
    identifier = PersonIdentifier(
        doc={
            "name": "Douglas",
            "interrogation_key": "_'Larsson',_'_Neo'__2023-02-15_p.208",
        }
    )
    identifier.find_with_llm()