Refactor print_color functions to accept multiple arguments

main
lasseedfast 2 years ago
parent 5cc6b45673
commit cecd4c0f12
  1. 6
      Malå.py
  2. 95
      check_names_app.py
  3. 297
      extract_persons.py
  4. 199
      extract_rumors.py
  5. 137
      identify_person.py
  6. 20
      print_color.py
  7. 33
      viz_mentions.py
  8. 67
      viz_rumors.py

@ -0,0 +1,6 @@
import streamlit as st
# Configure the Streamlit page; only the page title is set.
st.set_page_config(page_title="Malå")

@ -1,95 +0,0 @@
import streamlit as st
from identify_person import find_person, verify
from _arango import arango
import re
from time import time
@st.cache_data()
def get_persons():
    """Return every document of the ``persons`` collection as a list.

    The result is wrapped in ``st.cache_data``.
    """
    persons_collection = arango.db.collection('persons')
    return list(persons_collection.all())
@st.cache_data()
def get_other_persons():
    """Return the ``persons`` documents whose ``other`` field is true.

    Reads the module-level ``db`` handle, which must be assigned before the
    first call. The result is wrapped in ``st.cache_data``.
    """
    query = 'for doc in persons filter doc.other == true return doc'
    cursor = db.aql.execute(query)
    return list(cursor)
@st.cache_data()
def get_suggestions(person):
    """Look up match suggestions for ``person`` via ``find_person``.

    Logs the lookup and its outcome to stdout and shows a Streamlit spinner
    while ``find_person`` runs. Whatever ``find_person`` returns is passed
    back unchanged.
    """
    print('Finding person', person)
    with st.spinner('Hämtar data...'):
        found = find_person(person)
    print('ANSWERS', found)
    return found
# Streamlit review page: shows one LLM-generated suggestion at a time asking
# whether an unconfirmed person ("other person") is the same as a known one,
# and lets the reviewer answer or pick a different known person instead.
db = arango.db

# --- One-time session initialisation ---------------------------------------
if 'persons' not in st.session_state:
    st.session_state.persons = get_persons()
    st.session_state.persons_names = [i['name'] for i in st.session_state.persons]
    st.session_state.persons_dict = {i['name']: i['_key'] for i in st.session_state.persons}

if 'other_persons' not in st.session_state:
    st.session_state.other_persons = get_other_persons()

if 'radio1' in st.session_state:
    del st.session_state['radio1']

if 'suggestions' not in st.session_state:
    st.session_state.suggestions = []

# --- Refill the suggestion queue when it is used up --------------------------
if st.session_state.suggestions == []:
    st.session_state.other_person = st.session_state.other_persons.pop(0)
    suggestions = get_suggestions(st.session_state.other_person)
    print(suggestions)
    # Fix: treat an empty list like None. Otherwise the pop(0) below raises
    # IndexError when the lookup yields no candidates.
    if not suggestions:
        st.rerun()
    else:
        st.session_state.suggestions = suggestions

# --- Present the current suggestion ------------------------------------------
st.session_state.suggestion = st.session_state.suggestions.pop(0)
(
    answer,
    person_in_arango,
    interrogation_doc,
    other_person,
    found_person,
    found_person_info,
    person,
) = st.session_state.suggestion
text = interrogation_doc['text']
answer = answer.replace('\n', ' ')

st.markdown(f'Är :blue[{other_person}] samma som :blue[{found_person}]?')
st.write(f'(från förhör med {interrogation_doc["person"]})')

# Pre-select the radio option that matches the LLM's verdict, if it gave one.
if 'JA' in answer:
    st.markdown(f'🤖\n:green[{answer}]')
    radio_index = 0
elif 'NEJ' in answer:
    radio_index = 1
    st.markdown(f'🤖\n:red[{answer}]')
else:
    radio_index = None
    st.markdown(f'🤖\n{answer}')

with st.form('Form'):
    with st.expander('Mer information'):
        # Collapse blank-line runs, then render newlines as HTML line breaks.
        text = re.sub(r'\n\n+', '\n', text)
        text = text.replace('\n', '<br>')
        st.markdown(f'##### Förhöret:\n{text}', unsafe_allow_html=True)
        st.markdown(f'##### {found_person}:')
        st.markdown(found_person_info, unsafe_allow_html=True)

    # A timestamp is used so the widgets get fresh keys on each script run.
    widget_stamp = time()
    user_choice = st.radio('', ('Ja', 'Nej', 'Vet ej'), key=widget_stamp, index=radio_index)
    # Fix: the original computed `time() + 'multiselect'`, which adds a float
    # to a str and raises TypeError. Build the key as a string instead.
    other_choice = st.selectbox(
        'Välj alla som stämmer',
        st.session_state.persons_names,
        placeholder='Sök en annan',
        index=None,
        key=f'{widget_stamp}multiselect',
    )
    st.form_submit_button('Nästa')

# --- Act on the reviewer's answer ---------------------------------------------
if other_choice:
    # The reviewer picked a different known person than the suggested one.
    person_in_arango = db.collection('persons').get(st.session_state.persons_dict[other_choice])
    print('Yes', person, person_in_arango, text, db)
    #verify('Yes', person, person_in_arango, text, db)
elif user_choice == 'Ja':
    print(('Yes', person, person_in_arango))
    #verify('Yes', person, person_in_arango, text, db)
elif user_choice == 'Nej':
    pass
elif user_choice == 'Vet ej':
    pass
    #verify('Unknown', person, person_in_arango, text, db)

@ -8,14 +8,66 @@ import random
from time import sleep from time import sleep
import traceback import traceback
from pprint import pprint from pprint import pprint
from print_color import *
def check_name(name, persons):
valid_name = True def check_name(person, answer_person, text):
name_parts = name.split() print_yellow(person, " - ", answer_person)
if len(name_parts) > 1: same = False
if name_parts[0].istitle() and name_parts[1].istitle(): # If full name similarity is below a threshold (e.g., 0.5), compare first names only
valid_name = False
return valid_name # If person only has one name, first or last, compare that to first and last name of answer_person
if len(person.strip().split()) == 1:
llm = LLM()
answer_first_name = answer_person.split()[0].strip()
answer_last_name = answer_person.split()[-1].strip()
if difflib.SequenceMatcher(None, person, answer_first_name).ratio() > 0.9:
if answer_last_name in text:
same = True
else:
# Count how many time the first name appears in the first_names list
first_names = [
i["name"].split()[0] for i in db.collection("persons").all()
]
first_name_count = first_names.count(answer_first_name)
if first_name_count == 1:
same = True
else:
answer = llm.generate(
f'Nämns någon med efternamnet "{answer_last_name}" i texten nedan?\n\n"""{text[:6000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
)
if "JA" in answer:
same = True
elif difflib.SequenceMatcher(None, person, answer_last_name).ratio() > 0.9:
if answer_first_name in text:
same = True
else:
answer = llm.generate(
f'Nämns någon med förnamnet "{answer_first_name}" i texten nedan?\n\n"""{text[:6000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
)
if "JA" in answer:
same = True
else:
name_similarity = difflib.SequenceMatcher(None, person, answer_person).ratio()
print("Similarity:", name_similarity)
# person_first_name = person.split()[0]
# answer_person_first_name = answer_person.split()[0]
# first_name_similarity = difflib.SequenceMatcher(
# None, person_first_name, answer_person_first_name
# ).ratio()
# person_last_name = person.split()[-1]
# answer_person_last_name = answer_person.split()[-1]
# print("new:", name_similarity)
if name_similarity > 0.9:
same = True
return same
def execute_query_with_retry(db, query, max_retries=5, delay=2): def execute_query_with_retry(db, query, max_retries=5, delay=2):
@ -32,16 +84,26 @@ def execute_query_with_retry(db, query, max_retries=5, delay=2):
# If we've exhausted all retries, re-raise the last exception # If we've exhausted all retries, re-raise the last exception
raise raise
# Then, in your extract_persons function: # Then, in your extract_persons function:
def extract_persons(interrogation): def extract_persons(interrogation):
known_persons = {
"Douglas": "Douglas Bengtsson",
"Rashid": "Rashid Sheiksaid",
"Emanuel": "Emanuel Johansson",
"Robert": "Robert Bengtsson",
}
sleep(random.uniform(0.05, 0.3)) sleep(random.uniform(0.05, 0.3))
print(interrogation["_key"].upper()) print("INTERROGATION:", interrogation["_key"])
q = "for doc in persons filter doc.other != true return doc" q = "for doc in persons filter doc.other != true return doc"
result = execute_query_with_retry(db, q) result = execute_query_with_retry(db, q)
persons_docs = list(result) persons_docs = list(result)
persons = [i["name"].strip() for i in persons_docs] persons = [i["name"].strip() for i in persons_docs]
first_names = {i["name"].split()[0].strip(): i["name"] for i in persons_docs}
persons_dict = {i["name"]: i for i in persons_docs} persons_dict = {i["name"]: i for i in persons_docs}
@ -67,130 +129,144 @@ def extract_persons(interrogation):
Jag är inte intresserad av förhörsledaren eller personen som förhörs.''' Jag är inte intresserad av förhörsledaren eller personen som förhörs.'''
response = llm.generate(prompt) response = llm.generate(prompt)
response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "") response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")
print(response)
for name in [i.strip() for i in response.split(",") if len(i) > 2]: for name in [i.strip() for i in response.split(",") if len(i) > 2]:
if name not in names: if name not in names:
names.append(name) names.append(name)
for name in names: for name in names:
try: # Compare the person to a list of known persons
# Compare the person to a list of known persons prompt = f'''Jag vill veta vem {name} är. Kolla på förhöret nedan och svara om du hittar något om personen där.
prompt = f'''Jag vill veta vem {name} är. Kolla på förhöret nedan och svara om du hittar något om personen där. """{chunk}"""\n
"""{chunk}"""\n Vem är {name}? Svara bara med sådant som finns i texten.'''
Vem är {name}? Svara bara med sådant som finns i texten.''' info = llm.generate(prompt)
info = llm.generate(prompt) person = None
person = None # Reverse name
# Reverse name if name in persons:
if name in persons: person = persons_dict[name]
person = persons_dict[name]
elif name.split().reverse() in persons: elif name in known_persons:
print("Vände och hittade ✌", name.split().reverse()) person = persons_dict[known_persons[name]]
person = persons_dict[name.split().reverse()]
else: elif name.split().reverse() in persons:
closest_matchs = difflib.get_close_matches(name, persons, n=8) print("Vände och hittade ✌", name.split().reverse())
persons_string = "\n".join(closest_matchs) person = persons_dict[name.split().reverse()]
prompt = f"""Jag behöver identifiera {name}. Nedan är en lista på personer det kanske skulle kunna vara:\n else:
{persons_string}\n
Är {name} någon av dessa personer? Ofta står personen bara med sitt förnamn eller efternamn, men försök att lista ut om det är någon av personerna ovan. Namnet kan också vara felstavat, men inte ett helt annat namn. closest_matches = difflib.get_close_matches(name, persons, n=4, cutoff=0.3)
Svara BARA med namnet personen ur listan, och bara om du är helst säker att det verkligen är samma person. Är du inte säker svara "None".""" if name.split()[0] in first_names:
answer_person = llm.generate(prompt) if first_names[name.split()[0]] not in closest_matches:
closest_matches.append(first_names[name.split()[0]])
if answer_person in persons and check_name(name, persons): persons_string = "\n".join(closest_matches)
person = persons_dict[answer_person] prompt = f"""Jag behöver identifiera {name}. Nedan är en lista på personer det kanske skulle kunna vara:\n
else: {persons_string}\n
q = f"for doc in persons return {{'name': doc['name'], 'info': doc['info']}}" Är {name} någon av dessa personer? I texten kan personen stå med bara sitt förnamn eller efternamn, kolla speciellt efter namn i listan där förnamnet eller efternamnet stämmer. Namnet kan också vara felstavat, men inte ett helt annat namn.
persons_arango_docs = list(db.aql.execute(q)) Svara BARA med namnet personen ur listan. Är du inte säker svara "None"."""
answer_person = llm.generate(prompt)
persons_with_info = [
f"{name} - {info}" if answer_person in persons and check_name(
for i in persons_arango_docs name, answer_person, interrogation["text"]
for name, info in i.items() ):
] person = persons_dict[answer_person]
persons_with_info_string = "\n".join(persons_with_info)
prompt = f"""Här är mer information om möjliga personer:\n if person:
{persons_with_info_string}\n print_green(f'{name} identified: {person["name"]}', "\n")
Försök att utifrån informationen ovan samt förhöret du tagit del av i tidigare meddelanden identifiera {name}.
Svara BARA med namnet personen ur listan, och bara om du är helst säker att det verkligen är samma person. Är du inte säker svara "None".""" if "info" not in person:
person["info"] = []
answer_person = llm.generate(prompt) if info not in person["info"]:
person["info"].append(info)
if answer_person in persons and check_name(name, persons): if "mentioned_in_interrogation" not in person:
person = persons_dict[answer_person] person["mentioned_in_interrogation"] = []
if interrogation["_key"] not in person["mentioned_in_interrogation"]:
if person: person["mentioned_in_interrogation"].append(interrogation["_key"])
print(f"\033[92m{name} found in database: {person['name']}\033[0m") if "mentioned_as" not in person:
person["mentioned_as"] = []
if "info" not in person: if {name: interrogation["_key"]} not in person["mentioned_as"]:
person["info"] = [] person["mentioned_as"].append({name: interrogation["_key"]})
if info not in person["info"]: db.collection("persons").update(person, check_rev=False)
person["info"].append(info) # db.collection("all_relations").insert(
if "mentioned_in_interrogation" not in person: # {
person["mentioned_in_interrogation"] = [] # "_from": interrogation["person_id"],
if interrogation["_key"] not in person["mentioned_in_interrogation"]: # "_to": person["_id"],
person["mentioned_in_interrogation"].append(interrogation["_key"]) # "relation": "mentioned_by",
db.collection("persons").update(person, check_rev=False) # "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
# db.collection("all_relations").insert( # "_key": f'{interrogation["_key"]}_{person["_key"]}'
# { # },
# "_from": interrogation["person_id"], # overwrite_mode="update",
# "_to": person["_id"], # merge=True,
# "relation": "mentioned_by", # )
# "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}], else:
# "_key": f'{interrogation["_key"]}_{person["_key"]}' print(f"\033[91m{name} not identified\033[0m")
# }, print_yellow(
# overwrite_mode="update", "\n".join([f"- {i}" for i in persons_string.split("\n")]), "\n"
# merge=True, )
# ) print()
else:
print(f"\033[91m{name} not found in database\033[0m")
_key = arango.fix_key_name(name) _key = arango.fix_key_name(name) #TODO Are there multiple persons with the same name?
doc = db.collection("persons").insert(
{ # If no confirmed person was identified, create a new person or add to another unconfirmed person
if not db.collection("persons").get(_key):
if db.collection("persons").get(_key):
doc = db.collection("persons").get(_key)
else:
doc = {
"_key": _key, "_key": _key,
"name": name, "name": name,
"info": info, "info": [info],
"other": True, "other": True,
"confirmed": False,
"mentioned_in_interrogation": [interrogation["_key"]], "mentioned_in_interrogation": [interrogation["_key"]],
}, }
overwrite_mode="update",
merge=True,
)
# db.collection("all_relations").insert( else:
# { doc = db.collection("persons").get(_key)
# "_from": interrogation["person_id"],
# "_to": doc["_id"], if interrogation["_key"] not in doc["mentioned_in_interrogation"]:
# "relation": "mentioned_by", doc["mentioned_in_interrogation"].append(interrogation["_key"])
# 'other': True, if info not in doc["info"]:
# "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}], doc["info"].append(info)
# "_key": f'{interrogation["_key"]}_{doc["_key"]}' db.collection("persons").insert(doc, merge=False, overwrite_mode='update')
# },
# overwrite_mode="update", # db.collection("all_relations").insert(
# merge=True, # {
# ) # "_from": interrogation["person_id"],
except Exception as e: # "_to": doc["_id"],
traceback.print_exc() # "relation": "mentioned_by",
print(f"\033[91mError when processing {name}: {e}\033[0m") # 'other': True,
# "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
# "_key": f'{interrogation["_key"]}_{doc["_key"]}'
# },
# overwrite_mode="update",
# merge=True,
# )
if __name__ == "__main__": if __name__ == "__main__":
db = arango.db db = arango.db
q = "for doc in interrogations return doc" q = 'for doc in interrogations filter doc._key == "Markus_Lindahl_2023-02-20_p.93" return doc'
interrogations = list(db.aql.execute(q)) interrogations = list(db.aql.execute(q))
interrogations.sort(key=lambda x: x["date"]) interrogations.sort(key=lambda x: x["date"])
persons = list(db.collection("persons").all()) persons = list(db.collection("persons").all())
interrogations_done = [] interrogations_done = []
for person in persons: for person in persons:
if "mentioned_in_interrogation" in person and person["mentioned_in_interrogation"]: if (
"mentioned_in_interrogation" in person
and person["mentioned_in_interrogation"]
):
for interrogation in person["mentioned_in_interrogation"]: for interrogation in person["mentioned_in_interrogation"]:
interrogations_done.append(interrogation) interrogations_done.append(interrogation)
# interrogations = [
interrogations = [interrogation for interrogation in interrogations if interrogation["_key"] not in set(interrogations_done)] # interrogation
print("Number of interrogations to process:", len(interrogations)) # for interrogation in interrogations
# if interrogation["_key"] not in set(interrogations_done)
# ]
# print("Number of interrogations to process:", len(interrogations))
# q = 'for doc in interrogations return doc' # q = 'for doc in interrogations return doc'
# interrogations = list(db.aql.execute(q)) # interrogations = list(db.aql.execute(q))
@ -201,7 +277,8 @@ if __name__ == "__main__":
# interrogations = [interrogation for interrogation in interrogations if interrogation['_key'] not in rumors] # interrogations = [interrogation for interrogation in interrogations if interrogation['_key'] not in rumors]
# print('Number of interrogations to process:', len(interrogations)) # print('Number of interrogations to process:', len(interrogations))
# for interrogation in interrogations: for interrogation in interrogations:
# extract_persons(interrogation) extract_persons(interrogation)
with multiprocessing.Pool() as pool: exit()
with multiprocessing.Pool(processes=3) as pool:
pool.map(extract_persons, interrogations) pool.map(extract_persons, interrogations)

@ -2,16 +2,17 @@ import re
from _arango import arango from _arango import arango
from _llm import LLM from _llm import LLM
from pprint import pprint from pprint import pprint
from pprint import pprint
from langchain_text_splitters import CharacterTextSplitter from langchain_text_splitters import CharacterTextSplitter
import multiprocessing import multiprocessing
from print_color import *
class Interrogation: class Interrogation:
def __init__(self, interrogation): def __init__(self, interrogation):
self.interrogation = interrogation self.interrogation = interrogation
self.llm = LLM(chat=True) self.llm = LLM(chat=True)
self.llm_checker = LLM(chat=False) self.llm_checker = LLM(chat=False)
self.text = interrogation['text'] self.text = interrogation["text"]
# Info to collect # Info to collect
self.sexual_content = None self.sexual_content = None
@ -25,7 +26,7 @@ class Interrogation:
self.heard_from = None self.heard_from = None
self.sexual_chunk = None self.sexual_chunk = None
self.sexual_summary = None self.sexual_summary = None
self.self_heard_from_id = None self.heard_from_id = None
self.text_splitter = CharacterTextSplitter( self.text_splitter = CharacterTextSplitter(
separator="\n\n", separator="\n\n",
@ -33,21 +34,35 @@ class Interrogation:
chunk_overlap=0, chunk_overlap=0,
length_function=len, length_function=len,
is_separator_regex=False, is_separator_regex=False,
) )
self.chunks = self.text_splitter.split_text(self.text) self.chunks = self.text_splitter.split_text(self.text)
if 'mentioned_persons' in interrogation: if (
q = f'''for doc in persons filter doc._id in ["{'","'.join(interrogation["mentioned_persons"])}"] return doc''' "mentioned_persons" in interrogation
print(q) and interrogation["mentioned_persons"] != []
self.mentioned_persons = list(arango.db.aql.execute(q)) ):
self.mentioned_in_interrogation = [i['name'] for i in self.mentioned_persons] self.mentioned_persons = interrogation["mentioned_persons"]
self.mentioned_in_interrogations_dict = {i['name']: i['_id'] for i in self.mentioned_persons} self.mentioned_persons = list(
print(self.mentioned_in_interrogation) db.aql.execute(
"for doc in persons filter doc._key in @keys return doc",
bind_vars={"keys": self.mentioned_persons},
)
)
self.mentioned_persons_dict = {}
for person in self.mentioned_persons:
mentioned_as_name = None
if "mentioned_as" in person:
for i in person["mentioned_as"]:
name, interrogation_key = list(i.items())[0]
if interrogation_key == self.interrogation["_key"]:
mentioned_as_name = name
if not mentioned_as_name:
mentioned_as_name = person["name"]
self.mentioned_persons_dict[mentioned_as_name] = person["_key"]
else: else:
self.mentioned_in_interrogation = None self.mentioned_persons = None
self.mentioned_persons_dict = None
def find_sexual_content(self, chunk, check_text=False): def find_sexual_content(self, chunk, check_text=False):
prompt = f''' prompt = f'''
@ -72,9 +87,9 @@ class Interrogation:
else: else:
response = self.llm.generate(prompt) response = self.llm.generate(prompt)
if 'JA' in response: if "JA" in response:
sexual_content = True sexual_content = True
elif 'NEJ' in response: elif "NEJ" in response:
sexual_content = False sexual_content = False
else: else:
@ -87,97 +102,102 @@ class Interrogation:
if sexual_content: if sexual_content:
self.sexual_chunk = chunk self.sexual_chunk = chunk
prompt = f'''Beskriv det sexuella innehållet i förhöret.''' prompt = f"""Beskriv det sexuella innehållet i förhöret."""
self.sexual_content_description = self.llm.generate(prompt) self.sexual_content_description = self.llm.generate(prompt)
self.extract_sexual_info(chunk) self.extract_sexual_info(chunk)
prompt = f'''Ungefär när i tiden hände det som personen berättar om?''' prompt = f"""Ungefär när i tiden hände det som personen berättar om?"""
self.sexual_content_date = self.llm.generate(prompt) self.sexual_content_date = self.llm.generate(prompt)
def find_self_experience(self): def find_self_experience(self):
prompt = f'Har personen som förhörs själv varit med om något av det som beskrivs? Svara ENBART med "JA" eller "NEJ".' prompt = f'Har personen som förhörs själv varit med om något av det som beskrivs? Svara ENBART med "JA" eller "NEJ".'
response = self.llm.generate(prompt) response = self.llm.generate(prompt)
if 'JA' in response: if "JA" in response:
self.self_experience = True self.self_experience = True
elif 'NEJ' in response: elif "NEJ" in response:
self.self_experience = False self.self_experience = False
else: else:
self.self_experience = None self.self_experience = None
def find_self_involvement(self): def find_self_involvement(self):
prompt = f'Har personen som förhörs själv varit inblandad på något sätt? Svara ENBART med "JA" eller "NEJ".' prompt = f'Har personen som förhörs själv varit inblandad på något sätt? Svara ENBART med "JA" eller "NEJ".'
response = self.llm.generate(prompt) response = self.llm.generate(prompt)
if 'JA' in response: if "JA" in response:
self.self_involvement = True self.self_involvement = True
prompt = f'''På vilket sätt har personen som förhörs varit inblandad?''' prompt = f"""På vilket sätt har personen som förhörs varit inblandad?"""
self.self_involvement_type = self.llm.generate(prompt) self.self_involvement_type = self.llm.generate(prompt)
elif 'NEJ' in response: elif "NEJ" in response:
self.self_involvement = False self.self_involvement = False
else: else:
self.self_involvement = None self.self_involvement = None
def find_heard_about(self): def find_heard_about(self):
prompt = f'''Har personen hört talas om något av det som beskrivs? Svara ENBART med "JA" eller "NEJ".''' prompt = f"""Har personen hört talas om något av det som beskrivs? Svara ENBART med "JA" eller "NEJ"."""
response = self.llm.generate(prompt) response = self.llm.generate(prompt)
if 'JA' in response: if "JA" in response:
self.heard_about = True self.heard_about = True
self.find_heard_from() self.find_heard_from()
elif 'NEJ' in response: elif "NEJ" in response:
self.heard_about = False self.heard_about = False
else: else:
self.heard_about = None self.heard_about = None
def find_heard_from(self): def find_heard_from(self):
prompt = f'Av vem har personen hört det som beskrivs? Svara bara med namnet på personen, eller vad personen kallas.' if self.mentioned_persons_dict:
list_name = "\n".join(self.mentioned_persons_dict.keys())
heard_from = self.llm.generate(prompt) prompt = f"Av vem har personen hört det som beskrivs? Är det av någon av personerna nedan?\n\n{list_name}\n\nSvara ENBART med namnet på personen. Om inte det inte är någon av personerna i listan, svara bara None."
answer = self.llm.generate(prompt)
if self.mentioned_in_interrogation: print_blue("Hört av:", answer)
mentioned_in_interrogation = '\n'.join(self.mentioned_in_interrogation)
prompt = f'''Jag behöver identifiera vem {heard_from} är, verkar {heard_from} vara någon av följande personer:\n
{mentioned_in_interrogation} if answer in self.mentioned_persons_dict:
print_green("I DB:", self.mentioned_persons_dict[answer])
self.heard_from = answer
self.heard_from_id = "persons/" + self.mentioned_persons_dict[answer]
Svara ENBART med med namnet personen det skulle kunna vara. Om du inte vet svara "Jag vet inte".
'''
heard_from_answer = self.llm.generate(prompt)
if heard_from_answer in self.mentioned_in_interrogation:
self.heard_from = heard_from_answer
self.self_heard_from_id = self.mentioned_in_interrogations_dict[heard_from_answer]
else:
mentioned_info = '\n\n'.join([f'{i["name"].upper()}\n{i["info"]}' for i in self.mentioned_persons])
prompt = f'''Här är mer information om möjliga personer:\n
{mentioned_info}\n
Kan du utifrån den säga vem {heard_from} är? Svara BARA med namnet personen ur listanÄr du inte säker svara "Jag vet inte".
'''
heard_from_answer_info = self.llm.generate(prompt)
if heard_from_answer_info in self.mentioned_in_interrogation:
self.heard_from = heard_from_answer_info
self.self_heard_from_id = self.mentioned_in_interrogations_dict[heard_from_answer_info]
if not self.heard_from: if not self.heard_from:
self.heard_from = heard_from prompt = f"Av vem har personen i så fall hört det som beskrivs? Svara bara med namnet på personen, eller vad personen kallas."
self.heard_from = self.llm.generate(prompt)
print_yellow("Hört av:", self.heard_from)
if self.mentioned_persons:
mentioned_persons_with_info = db.aql.execute(
'for doc in persons filter doc._key in @keys return {"name":doc.name, "info":doc.info}',
bind_vars={"keys": self.mentioned_persons},
)
mentioned_info = "\n\n".join(
[
f'{i["name"].upper()}\n{i["info"]}'
for i in mentioned_persons_with_info
]
)
prompt = f"""Här är mer information om möjliga personer:\n
{mentioned_info}\n
Kan du utifrån den säga vem {self.heard_from} är? Svara BARA med namnet personen ur listanÄr du inte säker svara "Jag vet inte".
"""
heard_from_answer_info = self.llm.generate(prompt)
if heard_from_answer_info in self.mentioned_persons:
self.heard_from = heard_from_answer_info
self.heard_from_id = self.mentioned_persons_dict[heard_from_answer_info]
def create_arango_doc(self): def create_arango_doc(self):
return { return {
'_key': self.interrogation['_key'], "_key": self.interrogation["_key"],
'sexual_content': self.sexual_content, "sexual_content": self.sexual_content,
'sexual_content_description': self.sexual_content_description, "sexual_content_description": self.sexual_content_description,
'self_experience': self.self_experience, "self_experience": self.self_experience,
'self_involvement': self.self_involvement, "self_involvement": self.self_involvement,
'self_involvement_type': self.self_involvement_type, "self_involvement_type": self.self_involvement_type,
'heard_about': self.heard_about, "heard_about": self.heard_about,
'heard_from': self.heard_from, "heard_from": self.heard_from,
'interrogation_date': self.interrogation['date'], "heard_from_id": self.heard_from_id,
'sexual_content_date': self.sexual_content_date, "interrogation_key": self.interrogation["_key"],
'sexual_info': '\n'.join(self.sexual_info), "interrogation_date": self.interrogation["date"],
'sexual_summary': self.sexual_summary, "sexual_content_date": self.sexual_content_date,
} "sexual_info": "\n".join(self.sexual_info),
"sexual_summary": self.sexual_summary,
}
def extract_sexual_info(self, chunk): def extract_sexual_info(self, chunk):
@ -206,25 +226,31 @@ class Interrogation:
chunk = self.sexual_chunk chunk = self.sexual_chunk
index_of_chunk = self.chunks.index(chunk) index_of_chunk = self.chunks.index(chunk)
if index_of_chunk != len(self.chunks)-1: if index_of_chunk != len(self.chunks) - 1:
remaining_chunks = self.chunks[index_of_chunk+1:] remaining_chunks = self.chunks[index_of_chunk + 1 :]
for chunk in remaining_chunks: for chunk in remaining_chunks:
self.extract_sexual_info(chunk) self.extract_sexual_info(chunk)
sexual_info_string = '\n'.join(self.sexual_info) sexual_info_string = "\n".join(self.sexual_info)
prompt = f'Nedan är innehåll som samlats in ur förhöret:\n\n"""{sexual_info_string}"""\n\nSammanfatta innehållet på ett detaljerat vis.' prompt = f'Nedan är innehåll som samlats in ur förhöret:\n\n"""{sexual_info_string}"""\n\nSammanfatta innehållet på ett detaljerat vis.'
self.sexual_summary = self.llm.generate(prompt) self.sexual_summary = self.llm.generate(prompt)
def add_to_arango(self): def add_to_arango(self):
arango_doc = self.create_arango_doc() arango_doc = self.create_arango_doc()
db.collection('rumors').insert(arango_doc, overwrite=True, keep_none=False) db.collection("rumors").insert(arango_doc, overwrite=True, keep_none=False)
if arango_doc['sexual_content']:
pprint(arango_doc)
def process_interrogation(interrogation_data): def process_interrogation(interrogation_data):
"""
Process an interrogation by analyzing its content for sexual content and storing it in ArangoDB.
Args:
interrogation_data (dict): The data of the interrogation.
Returns:
None
"""
interrogation = Interrogation(interrogation_data) interrogation = Interrogation(interrogation_data)
for chunk in interrogation.chunks: for chunk in interrogation.chunks:
@ -253,18 +279,21 @@ def process_interrogation(interrogation_data):
if __name__ == "__main__": if __name__ == "__main__":
db = arango.db db = arango.db
q = 'for doc in interrogations return doc' q = "for doc in interrogations return doc"
interrogations = list(db.aql.execute(q)) interrogations = list(db.aql.execute(q))
# Filter out interrogations that have their _key in the rumors collection # Filter out interrogations that have their _key in the rumors collection
q = 'for rumor in rumors return rumor._key' q = "for rumor in rumors return rumor._key"
rumors = list(db.aql.execute(q)) rumors = list(db.aql.execute(q))
interrogations = [interrogation for interrogation in interrogations if interrogation['_key'] not in rumors] interrogations = [
print('Number of interrogations to process:', len(interrogations)) interrogation
for interrogation in interrogations
for i in interrogations: if interrogation["_key"] not in rumors
process_interrogation(i) ]
exit() print("Number of interrogations to process:", len(interrogations))
with multiprocessing.Pool() as pool:
# for i in interrogations:
# process_interrogation(i)
# exit()
with multiprocessing.Pool(3) as pool:
pool.map(process_interrogation, interrogations) pool.map(process_interrogation, interrogations)

@ -5,28 +5,6 @@ from pprint import pprint
from print_color import * from print_color import *
import multiprocessing import multiprocessing
def add_persons_to_chroma():
db = arango.db
q = "for doc in persons filter doc.other != true return doc"
persons = list(db.aql.execute(q))
# Lists to store the documents, metadatas and ids
documents = []
metadatas = []
ids = []
for person in persons:
if 'info' in person:
info = '\n'.join(person['info'])
documents.append(person['name'])
#documents.append(f"{person['name']}\n{info}")
metadata = {'name': person['name'], '_key': person['_key']}
metadatas.append(metadata)
ids.append(person["_key"])
collection = chroma.client.get_collection('mala_persons')
collection.add(documents=documents, metadatas=metadatas, ids=ids)
def find_person(person): def find_person(person):
""" """
@ -48,32 +26,38 @@ def find_person(person):
db = arango.db db = arango.db
llm = LLM() llm = LLM()
other_person = person['name'] other_person = person["name"]
chroma = ChromaDB() chroma = ChromaDB()
col = chroma.client.get_or_create_collection('mala_persons') col = chroma.client.get_or_create_collection("mala_persons")
hits = col.query(query_texts=[other_person], n_results=1) if "is_not" not in other_person:
filter_isnot = {}
else:
filter_isnot = {"name": {"$nin": other_person["is_not"]}}
found_person = hits['documents'][0][0] # Do a query to find the person
found_person_key = hits['metadatas'][0][0]['_key'] hits = col.query(query_texts=[other_person], n_results=1, where=filter_isnot)
distance = hits['distances'][0][0]
#* Filter out hits with distance > 1 found_person = hits["documents"][0][0]
found_person_key = hits["metadatas"][0][0]["_key"]
distance = hits["distances"][0][0]
# * Filter out hits with distance > 1
if distance > 1: if distance > 1:
return None return []
found_person_in_arango = db.collection('persons').get(found_person_key) found_person_in_arango = db.collection("persons").get(found_person_key)
found_person_info = '\n'.join(found_person_in_arango['info']) found_person_info = "\n".join(found_person_in_arango["info"])
prompt = f'Nedan är olika bitar med information om en person:\n\n{found_person_info}\n\nSammanfatta dessa på ett detaljerat sätt, var noga med namn, platser, händelser och relationer. Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. ' prompt = f"Nedan är olika bitar med information om en person:\n\n{found_person_info}\n\nSammanfatta dessa på ett detaljerat sätt, var noga med namn, platser, händelser och relationer. Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. "
person_in_arango_summary = llm.generate(prompt) person_in_arango_summary = llm.generate(prompt)
# Write summary about the person # Write summary about the person
interrogations = person['mentioned_in_interrogation'] interrogations = person["mentioned_in_interrogation"]
output = [] output = []
for interrogation in interrogations: for interrogation in interrogations:
interrogation_doc = db.collection('interrogations').get(interrogation) interrogation_doc = db.collection("interrogations").get(interrogation)
text = interrogation_doc['text'] text = interrogation_doc["text"]
prompt = f'''I texten nedan omnämns en "{other_person}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n prompt = f'''I texten nedan omnämns en "{other_person}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n
TEXT: TEXT:
@ -86,27 +70,82 @@ def find_person(person):
# Om istället förnamnet eller efternamnet är helt olika så är det förmodligen inte samma person.Om det bara är ett namn (inget efternamn) kan det också handla om ett smeknamn eller en beskrivning. # Om istället förnamnet eller efternamnet är helt olika så är det förmodligen inte samma person.Om det bara är ett namn (inget efternamn) kan det också handla om ett smeknamn eller en beskrivning.
answer = llm.generate(prompt) answer = llm.generate(prompt)
output.append((answer, found_person_in_arango, interrogation_doc, other_person, found_person, found_person_info, person)) output.append(
(
answer,
found_person_in_arango,
interrogation_doc,
other_person,
found_person,
found_person_info,
person,
)
)
return output return output
def verify(
    db,
    answer=None,
    person=None,
    person_in_arango=None,
    interrogation_key=None,
):
    """
    Apply a reviewer's verdict on whether `person` is `person_in_arango`.

    Args:
        db: The database object.
        answer (str): The verdict. Handled values are "Yes", "No" and
            "Unknown"; any other value only prints the answer.
        person (dict): The document of the unidentified person.
        person_in_arango (dict): The document of the candidate known person.
        interrogation_key (str): The key identifying the interrogation in
            which `person` was mentioned.

    Returns:
        None
    """
    print_blue("Answer:", answer)

    # If the answer is Yes
    if answer == "Yes":
        # Detach this interrogation from the unidentified person. Guarded so
        # a repeated confirmation does not raise ValueError.
        if interrogation_key in person["mentioned_in_interrogation"]:
            person["mentioned_in_interrogation"].remove(interrogation_key)
        person_in_arango["confirmed"] = True
        db.collection("persons").update(person)

        # Merge what is known about the mention into the known person.
        person_in_arango.setdefault("info", []).extend(person.get("info", []))
        # Record the interrogation itself (the previous code appended the
        # literal string "mentioned_in_interrogation" instead of the key).
        mentions = person_in_arango.setdefault("mentioned_in_interrogation", [])
        if interrogation_key is not None and interrogation_key not in mentions:
            mentions.append(interrogation_key)

        from pprint import pprint

        print("Updated person in arango:")
        pprint(
            db.collection("persons").insert(person_in_arango, overwrite_mode="update")
        )

        # Once no interrogation refers to the unidentified person any more,
        # move it out of `persons` -- unless it is the very same document.
        if (
            person["mentioned_in_interrogation"] == []
            and person["_key"] != person_in_arango["_key"]
        ):
            db.collection("other_persons").insert(person, overwrite=True)
            db.collection("persons").delete(person, check_rev=False)
            print(f"Removed {person}")

    # If the answer is No
    if answer == "No":
        # Keep `is_not` a flat list of names: it is used as the operand of a
        # `$nin` filter, which a nested list would never match.
        rejected = person.setdefault("is_not", [])
        if person_in_arango["name"] not in rejected:
            rejected.append(person_in_arango["name"])
        db.collection("persons").update(person, merge=True, check_rev=False)

    # If the answer is Unknown
    if answer == "Unknown":
        # NOTE(review): the whole person document is stored under "name";
        # possibly person["name"] was intended -- confirm before changing.
        db.collection("unknown").insert(
            {"name": person, "interrogation": interrogation_key}, overwrite=True
        )
if __name__ == "__main__":
db = arango.db db = arango.db
persons = list(db.collection('persons').all()) persons = list(db.collection("persons").all())
q = 'for doc in persons filter doc.other == true return doc' q = "for doc in persons filter doc.other == true return doc"
other_persons = [i for i in db.aql.execute(q)] other_persons = [i for i in db.aql.execute(q)]
for person in other_persons: for person in other_persons:

@ -1,11 +1,23 @@
def _colorize(code, args):
    """Join `args` the way print() does and wrap them in ANSI color `code`."""
    text = " ".join(map(str, args))
    return f"\033[{code}m{text}\033[0m"


def print_green(*args):
    """Print all arguments, space-separated, in bright green."""
    print(_colorize(92, args))


def print_red(*args):
    """Print all arguments, space-separated, in bright red."""
    print(_colorize(91, args))


def print_yellow(*args):
    """Print all arguments, space-separated, in bright yellow."""
    print(_colorize(93, args))


def print_blue(*args):
    """Print all arguments, space-separated, in bright blue."""
    print(_colorize(94, args))

@ -0,0 +1,33 @@
import networkx as nx
from _arango import arango
import random
from print_color import *
import json
import datetime
# Directed graph: interrogated person -> every person they mention.
G = nx.DiGraph()

q = "for doc in interrogations return doc"
interrogations = list(arango.db.aql.execute(q))

for doc in interrogations:
    if "mentioned_persons" in doc:
        # person_id has the form "<collection>/<key>"; the key names the node.
        source = doc["person_id"].split("/")[1]
        G.add_edges_from(
            (source, target, {"label": doc["_key"]})
            for target in doc["mentioned_persons"]
        )

# Save as GEXF, tagging the file name with the current wall-clock time.
stamp = datetime.datetime.now().strftime("%H-%M-%S")
filename = f"output_files/mentions_{stamp}.gexf"
nx.write_gexf(G, filename)

@ -0,0 +1,67 @@
import networkx as nx
from _arango import arango
import random
from print_color import *
import json
import datetime
# Create a new directed graph
G = nx.DiGraph()
q = "for doc in rumors filter doc.sexual_content != null return doc"
rumors = list(arango.db.aql.execute(q))
not_heard_from = 0
for rumor in rumors:
interrogation = arango.db.collection("interrogations").get(rumor["_key"])
if not rumor["sexual_summary"]:
rumor["sexual_summary"] = ""
# From person
if 'heard_from' not in rumor or not rumor['heard_from']:
not_heard_from += 1
rumor["heard_from"] = "Unknown_" + str(random.randint(1, 1000000))
# To person
if "person_id" in interrogation:
rumor["heard_person"] = interrogation["person"]
if 'class' not in rumor:
rumor['class'] = 'Unknown'
if 'class_description' not in rumor:
rumor['class_description'] = 'Unknown'
# Add an edge to the graph with 'sexual_summary' as an attribute
G.add_edge(
rumor["heard_from"],
rumor["heard_person"],
label=rumor["_key"],
content=rumor["sexual_summary"],
class_=rumor["class"],
class_description=rumor["class_description"],
)
heards_froms = set([rumor["heard_from"] for rumor in rumors])
heard_persons = set([rumor["heard_person"] for rumor in rumors])
all_nodes = list(heards_froms.union(heard_persons))
q = "for doc in persons filter doc.name in @all_nodes return doc"
persons = list(arango.db.aql.execute(q, bind_vars={"all_nodes": all_nodes}))
G.add_nodes_from(
[
(
person["name"],
{"_key": json.dumps(person["_key"]), "info": json.dumps(person["info"])},
)
for person in persons
]
)
# Write the graph to a GEXF file
current_time = datetime.datetime.now().strftime("%H-%M-%S")
filename = f"output_files/rumors_{current_time}.gexf"
nx.write_gexf(G, filename)
print(len(rumors))
print(not_heard_from)
Loading…
Cancel
Save