Refactor print_color functions to accept multiple arguments

main
lasseedfast 2 years ago
parent 5cc6b45673
commit cecd4c0f12
  1. 6
      Malå.py
  2. 95
      check_names_app.py
  3. 297
      extract_persons.py
  4. 201
      extract_rumors.py
  5. 149
      identify_person.py
  6. 20
      print_color.py
  7. 33
      viz_mentions.py
  8. 67
      viz_rumors.py

@ -0,0 +1,6 @@
import streamlit as st
# Set the Streamlit page title for the app's landing page.
st.set_page_config(page_title="Malå")

@ -1,95 +0,0 @@
import streamlit as st
from identify_person import find_person, verify
from _arango import arango
import re
from time import time
@st.cache_data()
def get_persons():
    """Return every document of the ``persons`` collection as a list.

    The result is cached by ``st.cache_data``.
    """
    persons_collection = arango.db.collection('persons')
    return list(persons_collection.all())
@st.cache_data()
def get_other_persons():
    """Return all person documents whose ``other`` flag is true.

    The query is run through ``arango.db`` (the same handle ``get_persons``
    uses) rather than the module-level ``db`` alias, which is only assigned
    after this function is defined. The result is cached by
    ``st.cache_data``.
    """
    q = 'for doc in persons filter doc.other == true return doc'
    # list() materialises the cursor directly; no per-item comprehension needed.
    return list(arango.db.aql.execute(q))
@st.cache_data()
def get_suggestions(person):
    """Look up match suggestions for *person* via ``find_person``.

    A spinner is shown while the lookup runs, and both the input and the
    result are echoed to stdout. Returns whatever ``find_person`` returns.
    """
    print('Finding person', person)
    with st.spinner('Hämtar data...'):
        suggestions = find_person(person)
    print('ANSWERS', suggestions)
    return suggestions
db = arango.db

# Keep the confirmed persons and two lookup helpers (names, name -> _key) in
# the session state so they are only loaded once per session.
if 'persons' not in st.session_state:
    st.session_state.persons = get_persons()
    st.session_state.persons_names = [i['name'] for i in st.session_state.persons]
    st.session_state.persons_dict = {i['name']: i['_key'] for i in st.session_state.persons}
if 'other_persons' not in st.session_state:
    st.session_state.other_persons = get_other_persons()
if 'radio1' in st.session_state:
    del st.session_state['radio1']
if 'suggestions' not in st.session_state:
    st.session_state.suggestions = []

# If suggestions are all used up, move on to the next unconfirmed person.
if not st.session_state.suggestions:
    if not st.session_state.other_persons:
        # Nothing left to review; stop instead of failing on pop(0) below.
        st.write('Inga fler personer att granska.')
        st.stop()
    st.session_state.other_person = st.session_state.other_persons.pop(0)
    suggestions = get_suggestions(st.session_state.other_person)
    print(suggestions)
    if not suggestions:
        # Covers both None and an empty list: there is nothing to show for
        # this person, so rerun the script to pick the next one.
        st.rerun()
    st.session_state.suggestions = suggestions

st.session_state.suggestion = st.session_state.suggestions.pop(0)
(
    answer,
    person_in_arango,
    interrogation_doc,
    other_person,
    found_person,
    found_person_info,
    person,
) = st.session_state.suggestion
text = interrogation_doc['text']
answer = answer.replace('\n', ' ')

st.markdown(f'Är :blue[{other_person}] samma som :blue[{found_person}]?')
st.write(f'(från förhör med {interrogation_doc["person"]})')

# Pre-select the radio button that matches the model's answer, if any.
if 'JA' in answer:
    st.markdown(f'🤖\n:green[{answer}]')
    radio_index = 0
elif 'NEJ' in answer:
    radio_index = 1
    st.markdown(f'🤖\n:red[{answer}]')
else:
    radio_index = None
    st.markdown(f'🤖\n{answer}')

with st.form('Form'):
    with st.expander('Mer information'):
        text = re.sub(r'\n\n+', '\n', text)
        text = text.replace('\n', '<br>')
        st.markdown(f'##### Förhöret:\n{text}', unsafe_allow_html=True)
        st.markdown(f'##### {found_person}:')
        st.markdown(found_person_info, unsafe_allow_html=True)
    # Time-based keys give the widgets a new identity on every run. The key
    # is built as a string: adding a str to the float from time() raises
    # TypeError.
    widget_key = str(time())
    user_choice = st.radio('', ('Ja', 'Nej', 'Vet ej'), key=widget_key, index=radio_index)
    other_choice = st.selectbox(
        'Välj alla som stämmer',
        st.session_state.persons_names,
        placeholder='Sök en annan',
        index=None,
        key=widget_key + 'multiselect',
    )
    st.form_submit_button('Nästa')

# An explicitly selected other person takes precedence over the radio answer.
if other_choice:
    person_in_arango = db.collection('persons').get(st.session_state.persons_dict[other_choice])
    print('Yes', person, person_in_arango, text, db)
    #verify('Yes', person, person_in_arango, text, db)
elif user_choice == 'Ja':
    print(('Yes', person, person_in_arango))
    #verify('Yes', person, person_in_arango, text, db)
elif user_choice == 'Nej':
    pass
elif user_choice == 'Vet ej':
    pass
    #verify('Unknown', person, person_in_arango, text, db)

@ -8,14 +8,66 @@ import random
from time import sleep
import traceback
from pprint import pprint
from print_color import *
def check_name(name, persons):
    """Return False when *name* begins with two title-cased words, else True.

    Names with fewer than two words are always considered valid. ``persons``
    is accepted but not used.
    """
    name_parts = name.split()
    starts_with_two_titled_words = (
        len(name_parts) > 1
        and name_parts[0].istitle()
        and name_parts[1].istitle()
    )
    return not starts_with_two_titled_words
def check_name(person, answer_person, text):
    """Decide whether *person* and *answer_person* are the same name.

    When ``person`` is not exactly one word, the two full names are compared
    with ``difflib`` and accepted above a ratio of 0.9.

    When ``person`` is a single word it is compared with the first and the
    last word of ``answer_person``:

    * first-name match: accepted if the last name occurs in ``text``, or the
      first name occurs exactly once among the first names in the ``persons``
      collection, or the LLM answers "JA" when asked whether the last name is
      mentioned in the first 6000 characters of ``text``;
    * last-name match: accepted if the first name occurs in ``text`` or the
      LLM answers "JA" for the first name.

    Returns:
        bool: True when the names are judged to be the same.
    """
    print_yellow(person, " - ", answer_person)

    def similarity(left, right):
        return difflib.SequenceMatcher(None, left, right).ratio()

    # Anything other than a single word: compare the complete names.
    if len(person.strip().split()) != 1:
        name_similarity = similarity(person, answer_person)
        print("Similarity:", name_similarity)
        return name_similarity > 0.9

    llm = LLM()
    answer_first_name = answer_person.split()[0].strip()
    answer_last_name = answer_person.split()[-1].strip()

    if similarity(person, answer_first_name) > 0.9:
        if answer_last_name in text:
            return True
        # Count how many times the first name appears among all first names
        first_names = [
            i["name"].split()[0] for i in db.collection("persons").all()
        ]
        if first_names.count(answer_first_name) == 1:
            return True
        answer = llm.generate(
            f'Nämns någon med efternamnet "{answer_last_name}" i texten nedan?\n\n"""{text[:6000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
        )
        return "JA" in answer

    if similarity(person, answer_last_name) > 0.9:
        if answer_first_name in text:
            return True
        answer = llm.generate(
            f'Nämns någon med förnamnet "{answer_first_name}" i texten nedan?\n\n"""{text[:6000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
        )
        return "JA" in answer

    return False
def execute_query_with_retry(db, query, max_retries=5, delay=2):
@ -32,16 +84,26 @@ def execute_query_with_retry(db, query, max_retries=5, delay=2):
# If we've exhausted all retries, re-raise the last exception
raise
# Then, in your extract_persons function:
def extract_persons(interrogation):
known_persons = {
"Douglas": "Douglas Bengtsson",
"Rashid": "Rashid Sheiksaid",
"Emanuel": "Emanuel Johansson",
"Robert": "Robert Bengtsson",
}
sleep(random.uniform(0.05, 0.3))
print(interrogation["_key"].upper())
print("INTERROGATION:", interrogation["_key"])
q = "for doc in persons filter doc.other != true return doc"
result = execute_query_with_retry(db, q)
persons_docs = list(result)
persons = [i["name"].strip() for i in persons_docs]
first_names = {i["name"].split()[0].strip(): i["name"] for i in persons_docs}
persons_dict = {i["name"]: i for i in persons_docs}
@ -67,130 +129,144 @@ def extract_persons(interrogation):
Jag är inte intresserad av förhörsledaren eller personen som förhörs.'''
response = llm.generate(prompt)
response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")
print(response)
for name in [i.strip() for i in response.split(",") if len(i) > 2]:
if name not in names:
names.append(name)
for name in names:
try:
# Compare the person to a list of known persons
prompt = f'''Jag vill veta vem {name} är. Kolla på förhöret nedan och svara om du hittar något om personen där.
"""{chunk}"""\n
Vem är {name}? Svara bara med sådant som finns i texten.'''
info = llm.generate(prompt)
person = None
# Reverse name
if name in persons:
person = persons_dict[name]
elif name.split().reverse() in persons:
print("Vände och hittade ✌", name.split().reverse())
person = persons_dict[name.split().reverse()]
else:
closest_matchs = difflib.get_close_matches(name, persons, n=8)
persons_string = "\n".join(closest_matchs)
prompt = f"""Jag behöver identifiera {name}. Nedan är en lista på personer det kanske skulle kunna vara:\n
{persons_string}\n
Är {name} någon av dessa personer? Ofta står personen bara med sitt förnamn eller efternamn, men försök att lista ut om det är någon av personerna ovan. Namnet kan också vara felstavat, men inte ett helt annat namn.
Svara BARA med namnet personen ur listan, och bara om du är helst säker att det verkligen är samma person. Är du inte säker svara "None"."""
answer_person = llm.generate(prompt)
if answer_person in persons and check_name(name, persons):
person = persons_dict[answer_person]
else:
q = f"for doc in persons return {{'name': doc['name'], 'info': doc['info']}}"
persons_arango_docs = list(db.aql.execute(q))
persons_with_info = [
f"{name} - {info}"
for i in persons_arango_docs
for name, info in i.items()
]
persons_with_info_string = "\n".join(persons_with_info)
prompt = f"""Här är mer information om möjliga personer:\n
{persons_with_info_string}\n
Försök att utifrån informationen ovan samt förhöret du tagit del av i tidigare meddelanden identifiera {name}.
Svara BARA med namnet personen ur listan, och bara om du är helst säker att det verkligen är samma person. Är du inte säker svara "None"."""
answer_person = llm.generate(prompt)
if answer_person in persons and check_name(name, persons):
person = persons_dict[answer_person]
if person:
print(f"\033[92m{name} found in database: {person['name']}\033[0m")
if "info" not in person:
person["info"] = []
if info not in person["info"]:
person["info"].append(info)
if "mentioned_in_interrogation" not in person:
person["mentioned_in_interrogation"] = []
if interrogation["_key"] not in person["mentioned_in_interrogation"]:
person["mentioned_in_interrogation"].append(interrogation["_key"])
db.collection("persons").update(person, check_rev=False)
# db.collection("all_relations").insert(
# {
# "_from": interrogation["person_id"],
# "_to": person["_id"],
# "relation": "mentioned_by",
# "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
# "_key": f'{interrogation["_key"]}_{person["_key"]}'
# },
# overwrite_mode="update",
# merge=True,
# )
else:
print(f"\033[91m{name} not found in database\033[0m")
# Compare the person to a list of known persons
prompt = f'''Jag vill veta vem {name} är. Kolla på förhöret nedan och svara om du hittar något om personen där.
"""{chunk}"""\n
Vem är {name}? Svara bara med sådant som finns i texten.'''
info = llm.generate(prompt)
person = None
# Reverse name
if name in persons:
person = persons_dict[name]
elif name in known_persons:
person = persons_dict[known_persons[name]]
elif name.split().reverse() in persons:
print("Vände och hittade ✌", name.split().reverse())
person = persons_dict[name.split().reverse()]
else:
closest_matches = difflib.get_close_matches(name, persons, n=4, cutoff=0.3)
if name.split()[0] in first_names:
if first_names[name.split()[0]] not in closest_matches:
closest_matches.append(first_names[name.split()[0]])
persons_string = "\n".join(closest_matches)
prompt = f"""Jag behöver identifiera {name}. Nedan är en lista på personer det kanske skulle kunna vara:\n
{persons_string}\n
Är {name} någon av dessa personer? I texten kan personen stå med bara sitt förnamn eller efternamn, kolla speciellt efter namn i listan där förnamnet eller efternamnet stämmer. Namnet kan också vara felstavat, men inte ett helt annat namn.
Svara BARA med namnet personen ur listan. Är du inte säker svara "None"."""
answer_person = llm.generate(prompt)
if answer_person in persons and check_name(
name, answer_person, interrogation["text"]
):
person = persons_dict[answer_person]
if person:
print_green(f'{name} identified: {person["name"]}', "\n")
if "info" not in person:
person["info"] = []
if info not in person["info"]:
person["info"].append(info)
if "mentioned_in_interrogation" not in person:
person["mentioned_in_interrogation"] = []
if interrogation["_key"] not in person["mentioned_in_interrogation"]:
person["mentioned_in_interrogation"].append(interrogation["_key"])
if "mentioned_as" not in person:
person["mentioned_as"] = []
if {name: interrogation["_key"]} not in person["mentioned_as"]:
person["mentioned_as"].append({name: interrogation["_key"]})
db.collection("persons").update(person, check_rev=False)
# db.collection("all_relations").insert(
# {
# "_from": interrogation["person_id"],
# "_to": person["_id"],
# "relation": "mentioned_by",
# "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
# "_key": f'{interrogation["_key"]}_{person["_key"]}'
# },
# overwrite_mode="update",
# merge=True,
# )
else:
print(f"\033[91m{name} not identified\033[0m")
print_yellow(
"\n".join([f"- {i}" for i in persons_string.split("\n")]), "\n"
)
print()
_key = arango.fix_key_name(name)
doc = db.collection("persons").insert(
{
_key = arango.fix_key_name(name) #TODO Are there multiple persons with the same name?
# If no confirmed person was identified, create a new person or add to another unconfirmed person
if not db.collection("persons").get(_key):
if db.collection("persons").get(_key):
doc = db.collection("persons").get(_key)
else:
doc = {
"_key": _key,
"name": name,
"info": info,
"info": [info],
"other": True,
"confirmed": False,
"mentioned_in_interrogation": [interrogation["_key"]],
},
overwrite_mode="update",
merge=True,
)
}
# db.collection("all_relations").insert(
# {
# "_from": interrogation["person_id"],
# "_to": doc["_id"],
# "relation": "mentioned_by",
# 'other': True,
# "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
# "_key": f'{interrogation["_key"]}_{doc["_key"]}'
# },
# overwrite_mode="update",
# merge=True,
# )
except Exception as e:
traceback.print_exc()
print(f"\033[91mError when processing {name}: {e}\033[0m")
else:
doc = db.collection("persons").get(_key)
if interrogation["_key"] not in doc["mentioned_in_interrogation"]:
doc["mentioned_in_interrogation"].append(interrogation["_key"])
if info not in doc["info"]:
doc["info"].append(info)
db.collection("persons").insert(doc, merge=False, overwrite_mode='update')
# db.collection("all_relations").insert(
# {
# "_from": interrogation["person_id"],
# "_to": doc["_id"],
# "relation": "mentioned_by",
# 'other': True,
# "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
# "_key": f'{interrogation["_key"]}_{doc["_key"]}'
# },
# overwrite_mode="update",
# merge=True,
# )
if __name__ == "__main__":
db = arango.db
q = "for doc in interrogations return doc"
q = 'for doc in interrogations filter doc._key == "Markus_Lindahl_2023-02-20_p.93" return doc'
interrogations = list(db.aql.execute(q))
interrogations.sort(key=lambda x: x["date"])
persons = list(db.collection("persons").all())
interrogations_done = []
for person in persons:
if "mentioned_in_interrogation" in person and person["mentioned_in_interrogation"]:
if (
"mentioned_in_interrogation" in person
and person["mentioned_in_interrogation"]
):
for interrogation in person["mentioned_in_interrogation"]:
interrogations_done.append(interrogation)
interrogations = [interrogation for interrogation in interrogations if interrogation["_key"] not in set(interrogations_done)]
print("Number of interrogations to process:", len(interrogations))
# interrogations = [
# interrogation
# for interrogation in interrogations
# if interrogation["_key"] not in set(interrogations_done)
# ]
# print("Number of interrogations to process:", len(interrogations))
# q = 'for doc in interrogations return doc'
# interrogations = list(db.aql.execute(q))
@ -201,7 +277,8 @@ if __name__ == "__main__":
# interrogations = [interrogation for interrogation in interrogations if interrogation['_key'] not in rumors]
# print('Number of interrogations to process:', len(interrogations))
# for interrogation in interrogations:
# extract_persons(interrogation)
with multiprocessing.Pool() as pool:
for interrogation in interrogations:
extract_persons(interrogation)
exit()
with multiprocessing.Pool(processes=3) as pool:
pool.map(extract_persons, interrogations)

@ -2,16 +2,17 @@ import re
from _arango import arango
from _llm import LLM
from pprint import pprint
from pprint import pprint
from langchain_text_splitters import CharacterTextSplitter
import multiprocessing
from print_color import *
class Interrogation:
def __init__(self, interrogation):
self.interrogation = interrogation
self.llm = LLM(chat=True)
self.llm_checker = LLM(chat=False)
self.text = interrogation['text']
self.text = interrogation["text"]
# Info to collect
self.sexual_content = None
@ -25,7 +26,7 @@ class Interrogation:
self.heard_from = None
self.sexual_chunk = None
self.sexual_summary = None
self.self_heard_from_id = None
self.heard_from_id = None
self.text_splitter = CharacterTextSplitter(
separator="\n\n",
@ -33,22 +34,36 @@ class Interrogation:
chunk_overlap=0,
length_function=len,
is_separator_regex=False,
)
self.chunks = self.text_splitter.split_text(self.text)
if 'mentioned_persons' in interrogation:
q = f'''for doc in persons filter doc._id in ["{'","'.join(interrogation["mentioned_persons"])}"] return doc'''
print(q)
self.mentioned_persons = list(arango.db.aql.execute(q))
self.mentioned_in_interrogation = [i['name'] for i in self.mentioned_persons]
self.mentioned_in_interrogations_dict = {i['name']: i['_id'] for i in self.mentioned_persons}
print(self.mentioned_in_interrogation)
if (
"mentioned_persons" in interrogation
and interrogation["mentioned_persons"] != []
):
self.mentioned_persons = interrogation["mentioned_persons"]
self.mentioned_persons = list(
db.aql.execute(
"for doc in persons filter doc._key in @keys return doc",
bind_vars={"keys": self.mentioned_persons},
)
)
self.mentioned_persons_dict = {}
for person in self.mentioned_persons:
mentioned_as_name = None
if "mentioned_as" in person:
for i in person["mentioned_as"]:
name, interrogation_key = list(i.items())[0]
if interrogation_key == self.interrogation["_key"]:
mentioned_as_name = name
if not mentioned_as_name:
mentioned_as_name = person["name"]
self.mentioned_persons_dict[mentioned_as_name] = person["_key"]
else:
self.mentioned_in_interrogation = None
self.mentioned_persons = None
self.mentioned_persons_dict = None
def find_sexual_content(self, chunk, check_text=False):
prompt = f'''
Texten nedan är en del av ett polisförhör.
@ -72,9 +87,9 @@ class Interrogation:
else:
response = self.llm.generate(prompt)
if 'JA' in response:
if "JA" in response:
sexual_content = True
elif 'NEJ' in response:
elif "NEJ" in response:
sexual_content = False
else:
@ -87,97 +102,102 @@ class Interrogation:
if sexual_content:
self.sexual_chunk = chunk
prompt = f'''Beskriv det sexuella innehållet i förhöret.'''
prompt = f"""Beskriv det sexuella innehållet i förhöret."""
self.sexual_content_description = self.llm.generate(prompt)
self.extract_sexual_info(chunk)
prompt = f'''Ungefär när i tiden hände det som personen berättar om?'''
prompt = f"""Ungefär när i tiden hände det som personen berättar om?"""
self.sexual_content_date = self.llm.generate(prompt)
def find_self_experience(self):
prompt = f'Har personen som förhörs själv varit med om något av det som beskrivs? Svara ENBART med "JA" eller "NEJ".'
response = self.llm.generate(prompt)
if 'JA' in response:
if "JA" in response:
self.self_experience = True
elif 'NEJ' in response:
elif "NEJ" in response:
self.self_experience = False
else:
self.self_experience = None
def find_self_involvement(self):
prompt = f'Har personen som förhörs själv varit inblandad på något sätt? Svara ENBART med "JA" eller "NEJ".'
response = self.llm.generate(prompt)
if 'JA' in response:
if "JA" in response:
self.self_involvement = True
prompt = f'''På vilket sätt har personen som förhörs varit inblandad?'''
prompt = f"""På vilket sätt har personen som förhörs varit inblandad?"""
self.self_involvement_type = self.llm.generate(prompt)
elif 'NEJ' in response:
elif "NEJ" in response:
self.self_involvement = False
else:
self.self_involvement = None
def find_heard_about(self):
prompt = f'''Har personen hört talas om något av det som beskrivs? Svara ENBART med "JA" eller "NEJ".'''
prompt = f"""Har personen hört talas om något av det som beskrivs? Svara ENBART med "JA" eller "NEJ"."""
response = self.llm.generate(prompt)
if 'JA' in response:
if "JA" in response:
self.heard_about = True
self.find_heard_from()
elif 'NEJ' in response:
elif "NEJ" in response:
self.heard_about = False
else:
self.heard_about = None
def find_heard_from(self):
prompt = f'Av vem har personen hört det som beskrivs? Svara bara med namnet på personen, eller vad personen kallas.'
if self.mentioned_persons_dict:
list_name = "\n".join(self.mentioned_persons_dict.keys())
prompt = f"Av vem har personen hört det som beskrivs? Är det av någon av personerna nedan?\n\n{list_name}\n\nSvara ENBART med namnet på personen. Om inte det inte är någon av personerna i listan, svara bara None."
answer = self.llm.generate(prompt)
print_blue("Hört av:", answer)
heard_from = self.llm.generate(prompt)
if answer in self.mentioned_persons_dict:
print_green("I DB:", self.mentioned_persons_dict[answer])
self.heard_from = answer
self.heard_from_id = "persons/" + self.mentioned_persons_dict[answer]
if self.mentioned_in_interrogation:
mentioned_in_interrogation = '\n'.join(self.mentioned_in_interrogation)
prompt = f'''Jag behöver identifiera vem {heard_from} är, verkar {heard_from} vara någon av följande personer:\n
{mentioned_in_interrogation}
Svara ENBART med med namnet personen det skulle kunna vara. Om du inte vet svara "Jag vet inte".
'''
heard_from_answer = self.llm.generate(prompt)
if heard_from_answer in self.mentioned_in_interrogation:
self.heard_from = heard_from_answer
self.self_heard_from_id = self.mentioned_in_interrogations_dict[heard_from_answer]
else:
mentioned_info = '\n\n'.join([f'{i["name"].upper()}\n{i["info"]}' for i in self.mentioned_persons])
prompt = f'''Här är mer information om möjliga personer:\n
{mentioned_info}\n
Kan du utifrån den säga vem {heard_from} är? Svara BARA med namnet personen ur listanÄr du inte säker svara "Jag vet inte".
'''
heard_from_answer_info = self.llm.generate(prompt)
if heard_from_answer_info in self.mentioned_in_interrogation:
self.heard_from = heard_from_answer_info
self.self_heard_from_id = self.mentioned_in_interrogations_dict[heard_from_answer_info]
if not self.heard_from:
self.heard_from = heard_from
prompt = f"Av vem har personen i så fall hört det som beskrivs? Svara bara med namnet på personen, eller vad personen kallas."
self.heard_from = self.llm.generate(prompt)
print_yellow("Hört av:", self.heard_from)
if self.mentioned_persons:
mentioned_persons_with_info = db.aql.execute(
'for doc in persons filter doc._key in @keys return {"name":doc.name, "info":doc.info}',
bind_vars={"keys": self.mentioned_persons},
)
mentioned_info = "\n\n".join(
[
f'{i["name"].upper()}\n{i["info"]}'
for i in mentioned_persons_with_info
]
)
prompt = f"""Här är mer information om möjliga personer:\n
{mentioned_info}\n
Kan du utifrån den säga vem {self.heard_from} är? Svara BARA med namnet personen ur listanÄr du inte säker svara "Jag vet inte".
"""
heard_from_answer_info = self.llm.generate(prompt)
if heard_from_answer_info in self.mentioned_persons:
self.heard_from = heard_from_answer_info
self.heard_from_id = self.mentioned_persons_dict[heard_from_answer_info]
def create_arango_doc(self):
return {
'_key': self.interrogation['_key'],
'sexual_content': self.sexual_content,
'sexual_content_description': self.sexual_content_description,
'self_experience': self.self_experience,
'self_involvement': self.self_involvement,
'self_involvement_type': self.self_involvement_type,
'heard_about': self.heard_about,
'heard_from': self.heard_from,
'interrogation_date': self.interrogation['date'],
'sexual_content_date': self.sexual_content_date,
'sexual_info': '\n'.join(self.sexual_info),
'sexual_summary': self.sexual_summary,
}
"_key": self.interrogation["_key"],
"sexual_content": self.sexual_content,
"sexual_content_description": self.sexual_content_description,
"self_experience": self.self_experience,
"self_involvement": self.self_involvement,
"self_involvement_type": self.self_involvement_type,
"heard_about": self.heard_about,
"heard_from": self.heard_from,
"heard_from_id": self.heard_from_id,
"interrogation_key": self.interrogation["_key"],
"interrogation_date": self.interrogation["date"],
"sexual_content_date": self.sexual_content_date,
"sexual_info": "\n".join(self.sexual_info),
"sexual_summary": self.sexual_summary,
}
def extract_sexual_info(self, chunk):
@ -206,25 +226,31 @@ class Interrogation:
chunk = self.sexual_chunk
index_of_chunk = self.chunks.index(chunk)
if index_of_chunk != len(self.chunks)-1:
remaining_chunks = self.chunks[index_of_chunk+1:]
if index_of_chunk != len(self.chunks) - 1:
remaining_chunks = self.chunks[index_of_chunk + 1 :]
for chunk in remaining_chunks:
self.extract_sexual_info(chunk)
sexual_info_string = '\n'.join(self.sexual_info)
sexual_info_string = "\n".join(self.sexual_info)
prompt = f'Nedan är innehåll som samlats in ur förhöret:\n\n"""{sexual_info_string}"""\n\nSammanfatta innehållet på ett detaljerat vis.'
self.sexual_summary = self.llm.generate(prompt)
def add_to_arango(self):
arango_doc = self.create_arango_doc()
db.collection('rumors').insert(arango_doc, overwrite=True, keep_none=False)
if arango_doc['sexual_content']:
pprint(arango_doc)
db.collection("rumors").insert(arango_doc, overwrite=True, keep_none=False)
def process_interrogation(interrogation_data):
"""
Process an interrogation by analyzing its content for sexual content and storing it in ArangoDB.
Args:
interrogation_data (dict): The data of the interrogation.
Returns:
None
"""
interrogation = Interrogation(interrogation_data)
for chunk in interrogation.chunks:
@ -242,7 +268,7 @@ def process_interrogation(interrogation_data):
interrogation.find_heard_about()
else:
interrogation.find_heard_about()
interrogation.collect_sexual_info()
interrogation.add_to_arango()
break
@ -253,18 +279,21 @@ def process_interrogation(interrogation_data):
if __name__ == "__main__":
    db = arango.db

    q = "for doc in interrogations return doc"
    interrogations = list(db.aql.execute(q))

    # Filter out interrogations that have their _key in the rumors collection.
    # The keys are held in a set so each membership test is a hash lookup
    # rather than a scan of the whole list.
    q = "for rumor in rumors return rumor._key"
    rumors = set(db.aql.execute(q))
    interrogations = [
        interrogation
        for interrogation in interrogations
        if interrogation["_key"] not in rumors
    ]
    print("Number of interrogations to process:", len(interrogations))

    # Serial variant, kept for debugging:
    # for i in interrogations:
    #     process_interrogation(i)
    # exit()
    with multiprocessing.Pool(3) as pool:
        pool.map(process_interrogation, interrogations)

@ -5,28 +5,6 @@ from pprint import pprint
from print_color import *
import multiprocessing
# Load every person whose `other` flag is not true from ArangoDB and add them
# to the 'mala_persons' Chroma collection: the person's name is the document,
# name and _key are the metadata, and _key is the id.
# NOTE(review): indentation was lost in this view, so it cannot be told which
# of the statements after `if 'info' in person:` belong to that branch —
# confirm against the original file.
# NOTE(review): `chroma` is not defined inside this function; presumably a
# module-level object — verify.
def add_persons_to_chroma():
db = arango.db
q = "for doc in persons filter doc.other != true return doc"
persons = list(db.aql.execute(q))
# Lists to store the documents, metadatas and ids
documents = []
metadatas = []
ids = []
for person in persons:
if 'info' in person:
# `info` is only referenced by the commented-out append below.
info = '\n'.join(person['info'])
documents.append(person['name'])
#documents.append(f"{person['name']}\n{info}")
metadata = {'name': person['name'], '_key': person['_key']}
metadatas.append(metadata)
ids.append(person["_key"])
# get_collection is used here, so the collection is expected to exist already.
collection = chroma.client.get_collection('mala_persons')
collection.add(documents=documents, metadatas=metadatas, ids=ids)
def find_person(person):
"""
@ -48,32 +26,38 @@ def find_person(person):
db = arango.db
llm = LLM()
other_person = person['name']
other_person = person["name"]
chroma = ChromaDB()
col = chroma.client.get_or_create_collection('mala_persons')
hits = col.query(query_texts=[other_person], n_results=1)
found_person = hits['documents'][0][0]
found_person_key = hits['metadatas'][0][0]['_key']
distance = hits['distances'][0][0]
#* Filter out hits with distance > 1
col = chroma.client.get_or_create_collection("mala_persons")
if "is_not" not in other_person:
filter_isnot = {}
else:
filter_isnot = {"name": {"$nin": other_person["is_not"]}}
# Do a query to find the person
hits = col.query(query_texts=[other_person], n_results=1, where=filter_isnot)
found_person = hits["documents"][0][0]
found_person_key = hits["metadatas"][0][0]["_key"]
distance = hits["distances"][0][0]
# * Filter out hits with distance > 1
if distance > 1:
return None
found_person_in_arango = db.collection('persons').get(found_person_key)
found_person_info = '\n'.join(found_person_in_arango['info'])
return []
found_person_in_arango = db.collection("persons").get(found_person_key)
found_person_info = "\n".join(found_person_in_arango["info"])
prompt = f'Nedan är olika bitar med information om en person:\n\n{found_person_info}\n\nSammanfatta dessa på ett detaljerat sätt, var noga med namn, platser, händelser och relationer. Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. '
prompt = f"Nedan är olika bitar med information om en person:\n\n{found_person_info}\n\nSammanfatta dessa på ett detaljerat sätt, var noga med namn, platser, händelser och relationer. Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. "
person_in_arango_summary = llm.generate(prompt)
# Write summary about the person
interrogations = person['mentioned_in_interrogation']
interrogations = person["mentioned_in_interrogation"]
output = []
for interrogation in interrogations:
interrogation_doc = db.collection('interrogations').get(interrogation)
text = interrogation_doc['text']
interrogation_doc = db.collection("interrogations").get(interrogation)
text = interrogation_doc["text"]
prompt = f'''I texten nedan omnämns en "{other_person}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n
TEXT:
@ -86,31 +70,86 @@ def find_person(person):
# Om istället förnamnet eller efternamnet är helt olika så är det förmodligen inte samma person.Om det bara är ett namn (inget efternamn) kan det också handla om ett smeknamn eller en beskrivning.
answer = llm.generate(prompt)
output.append((answer, found_person_in_arango, interrogation_doc, other_person, found_person, found_person_info, person))
output.append(
(
answer,
found_person_in_arango,
interrogation_doc,
other_person,
found_person,
found_person_info,
person,
)
)
return output
def verify(answer, person, person_in_arango, text, db):
    """Merge an unconfirmed person into a known person when the answer is 'Yes'.

    Args:
        answer: The reviewer's verdict; only ``'Yes'`` triggers any action.
        person: Document of the unconfirmed person (mutated in place).
        person_in_arango: Document of the known person (mutated in place).
        text: The entry to move from ``person['mentioned_in_interrogation']``
            to ``person_in_arango['mentioned_in_interrogation']``.
        db: Database handle exposing ``collection(name)``.

    Raises:
        ValueError: If ``text`` is not in ``person['mentioned_in_interrogation']``.
    """
    if answer != 'Yes':
        return

    person['mentioned_in_interrogation'].remove(text)
    db.collection('persons').update(person)

    person_in_arango['info'] += person['info']
    # Carry over the entry that was just removed from `person`. Previously the
    # literal string 'mentioned_in_interrogation' was appended instead.
    if text not in person_in_arango['mentioned_in_interrogation']:
        person_in_arango['mentioned_in_interrogation'].append(text)
    db.collection('persons').update(person_in_arango)

    # Archive the unconfirmed document, then drop it from `persons`.
    db.collection('other_persons').insert(person, overwrite=True)
    db.collection('persons').delete(person, check_rev=False)
def verify(
    db,
    answer=None,
    person=None,
    person_in_arango=None,
    interrogation_key=None,
):
    """
    Record a reviewer's verdict on whether two person documents are the same.

    Args:
        db: The database object.
        answer (str): The verdict. Can be "Yes", "No", or "Unknown".
        person (dict): The unconfirmed person's document (mutated in place).
        person_in_arango (dict): The candidate person's document in ArangoDB
            (mutated in place).
        interrogation_key (str): The key identifying the interrogation.

    Returns:
        None
    """
    print_blue("Answer:", answer)

    # If the answer is Yes
    if answer == "Yes":
        person["mentioned_in_interrogation"].remove(interrogation_key)
        person_in_arango["confirmed"] = True
        db.collection("persons").update(person)

        person_in_arango["info"] += person["info"]
        # Move the interrogation over to the confirmed person. Previously the
        # literal string "mentioned_in_interrogation" was appended instead of
        # the interrogation key.
        if interrogation_key not in person_in_arango["mentioned_in_interrogation"]:
            person_in_arango["mentioned_in_interrogation"].append(interrogation_key)

        from pprint import pprint

        print("Updated person in arango:")
        pprint(
            db.collection("persons").insert(person_in_arango, overwrite_mode="update")
        )

        # Once the unconfirmed document has no mentions left (and is not the
        # confirmed document itself), archive it and remove it from `persons`.
        if person["mentioned_in_interrogation"] == [] and person['_key'] != person_in_arango['_key']:
            db.collection("other_persons").insert(person, overwrite=True)
            db.collection("persons").delete(person, check_rev=False)
            print(f"Removed {person}")

    # If the answer is No
    if answer == "No":
        if "is_not" not in person:
            person["is_not"] = []
        # Store the name itself, not a one-element list: find_person hands
        # `is_not` to a `$nin` filter, which needs a flat list of names.
        rejected_name = person_in_arango["name"]
        if rejected_name not in person["is_not"]:
            person["is_not"].append(rejected_name)
        db.collection("persons").update(person, merge=True, check_rev=False)

    # If the answer is Unknown
    if answer == "Unknown":
        # NOTE(review): the whole person document is stored under "name";
        # presumably person["name"] was intended — confirm before changing.
        db.collection("unknown").insert(
            {"name": person, "interrogation": interrogation_key}, overwrite=True
        )
# Script entry point: fetch the persons flagged `other == true` and print the
# result of find_person() for each of them.
# NOTE(review): the `persons = ...` and `q = ...` statements appear twice,
# identical except for quote style; the second pair overwrites the first.
# NOTE(review): indentation was lost in this view, so it cannot be told
# whether exit() runs inside the loop or after it — confirm.
if __name__ == "__main__":
db = arango.db
persons = list(db.collection('persons').all())
q = 'for doc in persons filter doc.other == true return doc'
persons = list(db.collection("persons").all())
q = "for doc in persons filter doc.other == true return doc"
other_persons = [i for i in db.aql.execute(q)]
for person in other_persons:
print(find_person(person))
exit()
# Parallel variant, currently disabled:
# with multiprocessing.Pool() as pool:
# pool.map(find_person, other_persons)
# pool.map(find_person, other_persons)

@ -1,11 +1,23 @@
def print_green(*args):
    """Print the arguments in green (ANSI colour code 92).

    Every argument is converted with ``str`` and the pieces are joined with a
    single space, so there is no trailing space inside the coloured text.
    A single-argument call behaves as before.
    """
    text = " ".join(str(arg) for arg in args)
    print(f"\033[92m{text}\033[0m")
def print_red(*args):
    """Print the arguments in red (ANSI colour code 91).

    Every argument is converted with ``str`` and the pieces are joined with a
    single space, so there is no trailing space inside the coloured text.
    A single-argument call behaves as before.
    """
    text = " ".join(str(arg) for arg in args)
    print(f"\033[91m{text}\033[0m")
def print_yellow(*args):
    """Print the arguments in yellow (ANSI colour code 93).

    Every argument is converted with ``str`` and the pieces are joined with a
    single space, so there is no trailing space inside the coloured text.
    A single-argument call behaves as before.
    """
    text = " ".join(str(arg) for arg in args)
    print(f"\033[93m{text}\033[0m")
def print_blue(*args):
    """Print the arguments in blue (ANSI colour code 94).

    Every argument is converted with ``str`` and the pieces are joined with a
    single space, so there is no trailing space inside the coloured text.
    A single-argument call behaves as before.
    """
    text = " ".join(str(arg) for arg in args)
    print(f"\033[94m{text}\033[0m")

@ -0,0 +1,33 @@
import networkx as nx
from _arango import arango
import random
from print_color import *
import json
import datetime
# Build a directed "who mentions whom" graph from the interrogations and
# write it to a timestamped GEXF file.
import os

# Create a new directed graph
G = nx.DiGraph()

q = "for doc in interrogations return doc"
interrogations = list(arango.db.aql.execute(q))

for interrogation in interrogations:
    # An edge needs both ends: the interrogated person and the mentions.
    if "mentioned_persons" not in interrogation or "person_id" not in interrogation:
        continue
    # person_id is split on "/" and the second part is used as the node.
    person_key = interrogation["person_id"].split("/")[1]
    mentioned_persons = interrogation["mentioned_persons"]
    for mentioned_person in mentioned_persons:
        G.add_edge(
            person_key,
            mentioned_person,
            label=interrogation["_key"],
        )

# Write the graph to a GEXF file; create the output directory first so the
# write does not fail on a fresh checkout.
os.makedirs("output_files", exist_ok=True)
current_time = datetime.datetime.now().strftime("%H-%M-%S")
filename = f"output_files/mentions_{current_time}.gexf"
nx.write_gexf(G, filename)

@ -0,0 +1,67 @@
import networkx as nx
from _arango import arango
import random
from print_color import *
import json
import datetime
# Build a directed "heard from -> heard by" graph from the rumors collection
# and write it to a timestamped GEXF file.
import os

# Create a new directed graph
G = nx.DiGraph()

q = "for doc in rumors filter doc.sexual_content != null return doc"
rumors = list(arango.db.aql.execute(q))

not_heard_from = 0
for rumor in rumors:
    interrogation = arango.db.collection("interrogations").get(rumor["_key"])

    # The field may be missing or None; the edge attribute must be a string.
    rumor["sexual_summary"] = rumor.get("sexual_summary") or ""

    # From person: rumors without a source get a unique placeholder node.
    if not rumor.get("heard_from"):
        not_heard_from += 1
        rumor["heard_from"] = "Unknown_" + str(random.randint(1, 1000000))

    # To person: without an interrogated person there is no edge target, so
    # the rumor is skipped instead of raising KeyError further down.
    if not interrogation or "person_id" not in interrogation:
        print("Skipping rumor without an interrogated person:", rumor["_key"])
        continue
    rumor["heard_person"] = interrogation["person"]

    rumor.setdefault("class", "Unknown")
    rumor.setdefault("class_description", "Unknown")

    # Add an edge to the graph with 'sexual_summary' as an attribute
    G.add_edge(
        rumor["heard_from"],
        rumor["heard_person"],
        label=rumor["_key"],
        content=rumor["sexual_summary"],
        class_=rumor["class"],
        class_description=rumor["class_description"],
    )

# Every endpoint of an added edge is a node of G, so the graph's node list is
# the union of all sources and targets.
all_nodes = list(G.nodes)
q = "for doc in persons filter doc.name in @all_nodes return doc"
persons = list(arango.db.aql.execute(q, bind_vars={"all_nodes": all_nodes}))

# Attach _key and info to the nodes that correspond to known persons.
G.add_nodes_from(
    [
        (
            person["name"],
            {
                "_key": json.dumps(person["_key"]),
                "info": json.dumps(person.get("info")),
            },
        )
        for person in persons
    ]
)

# Write the graph to a GEXF file; create the output directory first so the
# write does not fail on a fresh checkout.
os.makedirs("output_files", exist_ok=True)
current_time = datetime.datetime.now().strftime("%H-%M-%S")
filename = f"output_files/rumors_{current_time}.gexf"
nx.write_gexf(G, filename)

print(len(rumors))
print(not_heard_from)
Loading…
Cancel
Save