Refactor print_color functions to accept multiple arguments

main
lasseedfast 2 years ago
parent b5ad23f652
commit 744b1f02f2
  1. 5
      Malå.py
  2. 335
      Verifiera_namn_.py
  3. 47
      _arango.py
  4. 93
      _chroma.py
  5. 88
      _llm.py
  6. 76
      _openai.py
  7. 91
      app.py
  8. 43
      app_test.py
  9. 72
      arango_admin.py
  10. 142
      extract_fup.py
  11. 248
      extract_persons.py
  12. 87
      extract_relations.py
  13. 25
      extract_rumors.py
  14. 26
      fix_relations.py
  15. 328
      identify_person.py
  16. 34
      identify_persons_in_relations.py
  17. 423
      person_identifier.py
  18. 11
      persons.py
  19. 51
      print_color.py
  20. 15
      saturday.py
  21. 94
      testopenai_chat.py

@ -4,3 +4,8 @@ import streamlit as st
st.set_page_config( st.set_page_config(
page_title="Malå", page_title="Malå",
) )
st.markdown('##### Välj något av alternativen till vänster.')
st.markdown('**Fråga om förhör** är en chatt där du kan fråga om förhör.')
st.markdown('**Personer** är en lista över personer i utredningen.')
st.markdown('**Verifiera namn** är en sida där du kan verifiera overifierade namn.')

@ -0,0 +1,335 @@
import streamlit as st
from identify_person import identify, verify, find_person, UnverifiedPerson, FoundPerson
from _arango import arango
import re
from fuzzywuzzy import process
from _llm import LLM as LLM_garda
from _openai import LLM_OpenAI as LLM
from print_color import *
from random import randint
# from print_color import *
print("Start")
def reset_choices():
    """Clear the three per-question selections stored in the Streamlit session state."""
    for state_key in ("user_choice", "unconfirmed_choice", "custom_choice"):
        st.session_state[state_key] = None
def check_if_dict_in_list(target_dict, list_of_dicts):
    """Tell whether a dict in ``list_of_dicts`` starts with the same pair as ``target_dict``.

    Only the first key/value pair of each dict is compared.

    Args:
        target_dict: Dict whose first item is the pair to look for.
        list_of_dicts: Iterable of dicts to search.

    Returns:
        True if some dict's first item equals the target's first item,
        otherwise False. An empty ``target_dict`` never matches, and empty
        dicts inside ``list_of_dicts`` are skipped instead of raising
        IndexError.
    """
    if not target_dict:
        return False
    target_key, target_value = next(iter(target_dict.items()))
    for candidate in list_of_dicts:
        if not candidate:
            # Nothing to compare against; an empty entry cannot match.
            continue
        key, value = next(iter(candidate.items()))
        if key == target_key and value == target_value:
            return True
    return False
def submitted():
    """Form-submit callback: record in the session state that the form was submitted."""
    st.session_state["next"] = True
@st.cache_data()
def sort_names_by_similarity(target_name, name_list):
    """Return the names in ``name_list`` ordered by descending fuzzy-match score.

    Args:
        target_name: Name to compare against.
        name_list: Candidate names.

    Returns:
        A list of the candidate names, best match first.
    """
    # limit equals the number of candidates, so every name gets a score.
    ranked = sorted(
        process.extract(target_name, name_list, limit=len(name_list)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [candidate for candidate, _score in ranked]
@st.cache_data()
def get_persons():
    """Fetch every document in the "persons" collection as a list (cached by Streamlit)."""
    persons_collection = arango.db.collection("persons")
    return list(persons_collection.all())
@st.cache_data()
def get_unverified_persons():
    """Return all person documents whose ``confirmed`` field is not true (cached by Streamlit)."""
    cursor = db.aql.execute(
        "for doc in persons filter doc.confirmed != true return doc"
    )
    return list(cursor)
def get_suggestions(person):
    """Run ``identify`` for ``person`` and publish the result in the session state.

    Sets ``st.session_state.unverified_person``, ``found_person`` and
    ``suggestions`` from the matching keys of the dict returned by
    ``identify``. Returns nothing.

    This function is deliberately NOT wrapped in ``st.cache_data``: it exists
    only for its side effects. A cache hit would skip the body, leaving the
    session state of a new session (or a stale one) without the values the
    rest of the script reads right after the call.

    Args:
        person: Person data passed straight to ``identify``.
    """
    suggestion = identify(person)
    st.session_state.unverified_person = suggestion["unverified_person"]
    st.session_state.found_person = suggestion["found_person"]
    st.session_state.suggestions = suggestion["suggestions"]
def caps(string):
    """Format a name for display in the select box.

    Names carrying a "*" marker are returned with every "*" removed; names
    without the marker are upper-cased.
    """
    return string.replace("*", "") if "*" in string else string.upper()
def get_unverified_person():
    """Pop a random unverified person and store it as the current one.

    The chosen document is removed from ``st.session_state.unverified_persons``
    and wrapped in ``UnverifiedPerson``. When a ValueError is raised (for
    instance by ``randint`` when the list is empty) a "no more persons"
    message is shown and the script run is stopped.
    """
    try:
        index = randint(0, len(st.session_state.unverified_persons) - 1)
        person_doc = st.session_state.unverified_persons.pop(index)
        st.session_state.unverified_person = UnverifiedPerson(person_doc)
    except ValueError:
        st.markdown(":green[Inga fler personer att verifiera.]")
        st.stop()
st.set_page_config(
    page_title="Malå",
)

# Get URL parameters
params = st.query_params
param_person_key = params.get("person_key", None)

db = arango.db

# Initialise the session state: persons, unverified persons and the user's choices.
if "next" not in st.session_state:
    st.session_state.next = False

if "persons" not in st.session_state:
    st.session_state.persons = get_persons()
    all_persons_name_list = []
    for person in st.session_state.persons:
        name = person["name"]
        # Persons added through the "custom choice" path have no "confirmed"
        # field at all, so treat a missing field as unconfirmed instead of
        # raising KeyError. Unconfirmed names are marked with "*".
        if not person.get("confirmed"):
            name += "*"
        all_persons_name_list.append(name)
    st.session_state.persons_names = all_persons_name_list
    st.session_state.persons_dict = {
        i["name"]: i["_key"] for i in st.session_state.persons
    }

if "unverified_persons" not in st.session_state:
    if param_person_key:
        # If a person key is provided in the URL, only show that person
        st.session_state.unverified_persons = list(
            db.aql.execute(
                "for doc in persons filter doc._key == @key return doc",
                bind_vars={"key": param_person_key},
            )
        )
        print_blue("param_person_key".upper(), st.session_state.unverified_persons)
    else:
        st.session_state.unverified_persons = get_unverified_persons()

if "persons_names" not in st.session_state:
    st.session_state.persons_names = arango.get_persons(confirmed=False)["names"]
if "user_choice" not in st.session_state:
    st.session_state.user_choice = None
if "unconfirmed_choice" not in st.session_state:
    st.session_state.unconfirmed_choice = None
if "custom_choice" not in st.session_state:
    st.session_state.custom_choice = None

# Pick a person only when none is active yet. The remaining-list being empty
# must NOT trigger a new pick: the active person was popped from that list, so
# after the last pop every rerun (including the one that handles the form
# submit) would otherwise stop with "no more persons" before the answer is
# processed. Moving on to the next person happens after a submission.
if "unverified_person" not in st.session_state:
    get_unverified_person()

if "found_person" not in st.session_state:
    st.session_state.found_person = None
if "suggestions" not in st.session_state:
    # Set new values for unverified_person, found_person and suggestions (as session_state)
    get_suggestions(st.session_state.unverified_person.__dict__)
    print_yellow("SUGGESTIONS", st.session_state.suggestions)
if "suggestion" not in st.session_state:
    st.session_state.suggestion = st.session_state.suggestions.pop(0)

# Get unconfirmed person, found person and answer from the suggestions
unverified_person: UnverifiedPerson = st.session_state.unverified_person
found_person: FoundPerson = st.session_state.found_person
answer = st.session_state.suggestion[0]
interrogation_doc = st.session_state.suggestion[1]
if isinstance(interrogation_doc, str):
    # A bare string is treated as a document key; load the interrogation itself.
    interrogation_doc = db.collection("interrogations").get(interrogation_doc)
text = interrogation_doc["text"]
# Intro: how many interrogations mention the name being verified.
current_person = st.session_state.unverified_person
mention_count = len(current_person.mentioned_in_interrogation)
st.markdown(
    f'Namnet **"{current_person.name}"** används i **{mention_count}** förhör. Namnet kan syfta på olika personer i olika sammanhang så vi går igenom förhören ett och ett.'
)

if not answer:
    # No suggested match: ask an open question.
    st.markdown(f"Vem är :blue[{unverified_person.name}]?")
    st.write(f'(från förhör med {interrogation_doc["person"]})')
else:
    # A suggested match exists: ask whether it is the same person and show
    # the model's verdict, pre-selecting the matching radio option.
    answer = answer.replace("\n", " ")
    st.markdown(
        f"Är :blue[{unverified_person.name}] samma som :blue[{found_person.name}]?"
    )
    print(found_person.__dict__)
    st.write(f'(från förhör med {interrogation_doc["person"]})')
    if "JA" in answer:
        radio_index = 0
        verdict_text = answer.replace('JA ', '')
        st.markdown(f"🤖\n:green[{verdict_text}]")
    elif "NEJ" in answer:
        radio_index = 1
        verdict_text = answer.replace('NEJ ', '')
        st.markdown(f"🤖\n:red[{verdict_text}]")
    else:
        radio_index = None
        st.markdown(f"🤖\n{answer}")

# Let the user expand for more info: the interrogation text as HTML-flavoured
# markdown with the speakers in bold and the name in question in red.
with st.expander("Mer information om förhöret"):
    text = text.replace("\nFL:", "<br>**FL:** ")
    text = text.replace("\nDH:", "<br>**DH:** ")
    text = re.sub(r"\n(?!\n)", "", text)
    text = re.sub(r"\n\n+", "\n", text)
    text = text.replace("\n", "<br>")
    text = text.replace(
        unverified_person.name, f"**:red[{unverified_person.name}]**"
    )
    st.markdown(f"##### Förhöret:\n{text}", unsafe_allow_html=True)
# Form with the three ways to answer: yes/no/unknown, pick a known person,
# or type a new name.
with st.form("select_alternative"):
    # The yes/no question is only shown when there is a suggested match.
    st.session_state.user_choice = (
        st.radio(
            "Select alternative",
            ("Ja", "Nej", "Vet ej"),
            key="user_choice_radio",
            index=radio_index,
        )
        if answer
        else None
    )

    # Known persons, most similar name first.
    ranked_names = sort_names_by_similarity(
        unverified_person.name, st.session_state.persons_names
    )
    st.session_state.unconfirmed_choice = st.selectbox(
        "Välj någon som stämmer",
        ranked_names,
        placeholder="Sök en annan",
        index=None,
        key="multiselect",
        format_func=caps,
        help="Personer i caps är bekräftade personer, välj någon av dem om det verkar stämma.",
    )

    # Free-text name; an empty field counts as "nothing entered".
    typed_name = st.text_input(
        "Annan person",
        key="custom",
        help="Skriv in namnet på personen om det inte finns i listan. Var noga med stavningen.",
    )
    st.session_state.custom_choice = typed_name if typed_name != "" else None

    # Submitting the form flags the run as "next".
    st.session_state.next = st.form_submit_button("Nästa", on_click=submitted)
# Handle a submitted form. Precedence: typed name > selected known person >
# yes/no/unknown radio answer.
if st.session_state.next:
    if st.session_state.custom_choice:
        # The user typed a new name: summarise the interrogation with the LLM
        # and store a new person document.
        print("CUSTOM CHOICE", st.session_state.custom_choice)
        llm = LLM()
        info = llm.generate(
            f'Nedan är ett polisförhör där en person omnämns som "{unverified_person.name}".\n\n{interrogation_doc["text"]}\n\nSammanfatta informationen om {unverified_person.name} på ett detaljerat sätt, var noga med namn, platser, händelser och relationer. Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. '
        )
        person_in_arango = db.collection("persons").insert(
            {
                "_key": arango.fix_key_name(st.session_state.custom_choice),
                "name": st.session_state.custom_choice,
                "info": [info],
                "mentioned_in_interrogation": [interrogation_doc["_key"]],
                "mentioned_as": [{unverified_person.name: interrogation_doc["_key"]}],
            }
        )
        # NOTE(review): this verifies against the *suggested* person, not the
        # one just inserted, and found_person may be None here — confirm intent.
        verify(
            db,
            "Yes",
            unverified_person.doc,
            found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )

    elif st.session_state.unconfirmed_choice:
        # The user picked a known person; strip the "*" unconfirmed marker.
        unconfirmed_choice = st.session_state.unconfirmed_choice.replace(
            "*", ""
        ).strip()
        print_yellow("OTHER CHOICE", unconfirmed_choice)
        doc = db.collection("persons").get(
            st.session_state.persons_dict[unconfirmed_choice]
        )
        found_person = FoundPerson(
            db, unconfirmed_choice, st.session_state.persons_dict[unconfirmed_choice]
        )
        print("NEW:", found_person.name)
        # NOTE(review): the other calls pass the ``.doc`` of each person, this
        # one passes the wrapper objects — confirm which form verify() expects.
        verify(db, "Yes", unverified_person, found_person, interrogation_doc["_key"])

    elif st.session_state.user_choice == "Ja":
        print("USER CHOICE", st.session_state.user_choice)
        # Record under which name the found person appears in this
        # interrogation. The name must come from the person being verified;
        # the previous code read a leftover ``person`` loop variable that is
        # either undefined or refers to an unrelated person.
        mention = {unverified_person.name: interrogation_doc["_key"]}
        if "mentioned_as" not in found_person.doc:
            found_person.doc["mentioned_as"] = []
        if not check_if_dict_in_list(mention, found_person.doc["mentioned_as"]):
            found_person.doc["mentioned_as"].append(mention)
        # Same call shape as the custom-choice branch above, so it does not
        # depend on keyword names the other verify() calls do not use.
        verify(
            db,
            "Yes",
            unverified_person.doc,
            found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )

    elif st.session_state.user_choice == "Nej":
        verify(
            db,
            "No",
            unverified_person=unverified_person.doc,
            found_person=found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )

    elif st.session_state.user_choice == "Vet ej":
        verify(
            db,
            "Unknown",
            unverified_person=unverified_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )

    reset_choices()

    if not param_person_key:
        # Continue with the next suggestion, or with the next person when the
        # current one has no suggestions left.
        if st.session_state.suggestions != []:
            st.session_state.suggestion = st.session_state.suggestions.pop(0)
        else:
            get_unverified_person()
            get_suggestions(st.session_state.unverified_person.__dict__)
            st.session_state.suggestion = st.session_state.suggestions.pop(0)
        st.rerun()
    else:
        # Single-person mode (person key given in the URL): thank and stop.
        st.markdown(":green[Tack!] Du kan stäna de här fliken nu.")
        st.stop()

@ -43,35 +43,34 @@ class ArangoDB:
return string return string
def get_persons(self, confirmed=True): def get_persons(self, confirmed=True):
""" """
Gets a list of all names in the database. Retrieves a list of persons from the database.
Args: Args:
confirmed (bool, optional): If True, only returns names of confirmed persons. confirmed (bool, optional): If True, only retrieves confirmed persons. Defaults to True.
If False, returns names of all persons.
Defaults to True.
Returns: Returns:
dict: A dictionary containing two lists: dict: A dictionary containing two keys:
- 'names': A list of all names in the database. - 'names': A list of person names.
- 'dict_persons': A list of dictionaries, where each dictionary contains the name and key of a person. - 'dict': A dictionary mapping person names to their corresponding keys.
""" """
confirmed_string = '' confirmed_string = ''
if confirmed: if confirmed:
confirmed_string = 'filter person.confirmed == true' confirmed_string = 'filter person.confirmed == true'
query = f""" query = f"""
FOR person IN persons FOR person IN persons
{confirmed_string} {confirmed_string}
RETURN {{'name': person.name, '_key': person._key}} RETURN {{'name': person.name, '_key': person._key}}
""" """
persons = [i for i in self.db.aql.execute(query)] persons = [i for i in self.db.aql.execute(query)]
names = [document['name'] for document in persons] names = [document['name'] for document in persons]
dict_persons = {document['name']: document['_key'] for document in persons} dict_persons = {document['name']: document['_key'] for document in persons}
return {'names': names, 'dict':dict_persons} return {'names': names, 'dict':dict_persons}
arango = ArangoDB() arango = ArangoDB()
db = arango.db
if __name__ == '__main__': if __name__ == '__main__':
arango = ArangoDB() arango = ArangoDB()
print(arango.db) print(len(arango.get_persons(confirmed=False)['names']))

@ -4,7 +4,8 @@ from chromadb.config import Settings
from chromadb.api.client import Client from chromadb.api.client import Client
from chromadb.api.models.Collection import Collection from chromadb.api.models.Collection import Collection
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
class ChromaDB: class ChromaDB:
""" """
@ -24,17 +25,21 @@ class ChromaDB:
host=host, host=host,
port=port, port=port,
) )
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction( # huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
api_key='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', # api_key="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",
model_name="KBLab/sentence-bert-swedish-cased" # model_name="KBLab/sentence-bert-swedish-cased",
) # )
self.embedding_function: embedding_functions = huggingface_ef self.embedding_function: embedding_functions = (
embedding_functions.SentenceTransformerEmbeddingFunction(
model_name="KBLab/sentence-bert-swedish-cased"
)
)
def print_collections(self): def print_collections(self):
""" """
Prints all collections in the database. Prints all collections in the database.
""" """
collections = self.client.list_collections() collections: Collection = self.client.list_collections()
for collection in collections: for collection in collections:
print(collection.name) print(collection.name)
@ -49,7 +54,9 @@ class ChromaDB:
None None
""" """
collection = self.client.get_collection("mala_persons") collection = self.client.get_or_create_collection(
"mala_persons", embedding_function=self.embedding_function
)
# Lists to store the documents, metadatas and ids # Lists to store the documents, metadatas and ids
documents = [] documents = []
@ -57,7 +64,11 @@ class ChromaDB:
ids = [] ids = []
documents.append(person["name"]) documents.append(person["name"])
metadata = {"name": person["name"], "_key": person["_key"], 'info': "\n".join(person["info"])} metadata = {
"name": person["name"],
"_key": person["_key"],
"info": "\n".join(person["info"]),
}
metadatas.append(metadata) metadatas.append(metadata)
ids.append(person["_key"]) ids.append(person["_key"])
@ -78,8 +89,10 @@ class ChromaDB:
""" """
from _arango import arango from _arango import arango
self.client.delete_collection('mala_persons') self.client.delete_collection("mala_persons")
col = self.client.get_or_create_collection('mala_persons') col = self.client.get_or_create_collection(
"mala_persons", embedding_function=self.embedding_function
)
db = arango.db db = arango.db
q = "for doc in persons filter doc.confirmed == true return doc" q = "for doc in persons filter doc.confirmed == true return doc"
@ -88,18 +101,60 @@ class ChromaDB:
for person in persons: for person in persons:
self.add_person_to_chroma(person) self.add_person_to_chroma(person)
print('Persons in chroma:', col.count()) print("Persons in chroma:", col.count())
def add_all_person_info(self):
"""
Adds all person information to the Chroma database.
"""
from _arango import arango
# Initialize the ChromaDB object try:
chroma = ChromaDB() self.client.delete_collection("mala_persons_info")
except:
pass
col = self.client.get_or_create_collection(
"mala_persons_info", embedding_function=self.embedding_function
)
if __name__ == '__main__': persons = list(arango.db.collection("persons").all())
chroma = ChromaDB() for person in persons:
chroma.add_all_persons_to_chroma() doc = person["name"] + "\n" + "\n".join(person["info"])
col.add(
documents=[doc],
metadatas=[{"name": person["name"], "_key": person["_key"]}],
ids=[person["_key"]],
)
def query(self, collection, query_texts, n_results=5, where=None):
    """Query a Chroma collection by text.

    Args:
        collection: Name of the collection to query.
        query_texts: A single query string or a list of query strings.
        n_results: Number of results to return per query text.
        where: Optional metadata filter. ``None`` or an empty dict means
            "no filter".

    Returns:
        Whatever the collection's ``query`` call returns.
    """
    if isinstance(query_texts, str):
        query_texts = [query_texts]
    col = self.client.get_collection(
        collection, embedding_function=self.embedding_function
    )
    # A shared mutable ``{}`` default is avoided, and an empty filter is sent
    # as None so the collection never receives an empty filter dict.
    return col.query(
        query_texts=query_texts,
        n_results=n_results,
        where=where or None,
    )
def add_interrogations():
from _arango import db
from langchain_text_splitters import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
separator="\n\n",
chunk_size=1000,
chunk_overlap=100,
length_function=len,
is_separator_regex=False,
)
interrogatons = list(db.collection('interrogations').all())
for interrogation in interrogatons:
chunks = text_splitter.split_text(interrogation['text'])
for chunk in chunks:
# Initialize the ChromaDB object
chroma = ChromaDB()
if __name__ == "__main__":
chroma.print_collections()
#chroma.add_all_persons_to_chroma()
#chroma.add_all_person_info()

@ -3,11 +3,11 @@ import requests
import concurrent.futures import concurrent.futures
import queue import queue
import threading import threading
from pprint import pprint
import re import re
from dotenv import load_dotenv from dotenv import load_dotenv
import os import os
import json
from print_color import *
load_dotenv() load_dotenv()
@ -16,11 +16,13 @@ class LLM:
def __init__( def __init__(
self, self,
chat: bool = False, chat: bool = False,
model: str = "llama3:8b-instruct-q5_K_M", model: str = os.getenv("LLM_MODEL"),
keep_alive: int = 3600 * 24, keep_alive: int = 3600 * 24,
start: bool = False, start: bool = False,
system_prompt: str = 'Svara alltid på svenska. Svara bara på det som efterfrågas. Om du inte kan svara, skriv "Jag vet inte".', system_prompt: str = 'Svara alltid på svenska. Svara bara på det som efterfrågas. Om du inte kan svara, skriv "Jag vet inte".',
temperature: str = 0, temperature: str = 0,
stream=False,
small=False
): ):
""" """
Initializes an instance of MyClass. Initializes an instance of MyClass.
@ -34,26 +36,31 @@ class LLM:
which processes requests concurrently. Defaults to False. which processes requests concurrently. Defaults to False.
""" """
self.model = model
self.server = os.getenv("LLM_URL") self.server = os.getenv("LLM_URL")
self.port = os.getenv("LLM_PORT") self.port = os.getenv("LLM_PORT")
self.model = model if small:
self.model = os.getenv("LLM_SMALL_MODEL")
self.server = os.getenv("LLM_SMALL_URL")
self.port = os.getenv("LLM_SMALL_PORT")
self.temperature = temperature self.temperature = temperature
self.system_message = {"role": "system", "content": system_prompt} self.system_message = {"role": "system", "content": system_prompt}
self.messages = [self.system_message] self.messages = [self.system_message]
self.chat = chat self.chat = chat
self.max_tokens = 24000 self.max_length = 24000
self.keep_alive = keep_alive self.keep_alive = keep_alive
self.request_queue = queue.Queue() self.request_queue = queue.Queue()
self.result_queue = queue.Queue() self.result_queue = queue.Queue()
self.all_requests_added_event = threading.Event() self.all_requests_added_event = threading.Event()
self.all_results_processed_event = threading.Event() self.all_results_processed_event = threading.Event()
self.stop_event = threading.Event() self.stop_event = threading.Event()
self.stream = stream
if start: if start:
self.start() self.start()
def generate(self, message):
def create_data_request(self, message):
# Remove leading and trailing whitespace # Remove leading and trailing whitespace
message = '\n'.join(line.strip() for line in message.split('\n')) message = '\n'.join(line.strip() for line in message.split('\n'))
@ -73,32 +80,46 @@ class LLM:
"messages": messages, "messages": messages,
"options": options, "options": options,
"keep_alive": self.keep_alive, "keep_alive": self.keep_alive,
"stream": False, "stream": self.stream,
} }
return data
def generate_stream(self, message):
# Make a POST request to the API endpoint # Make a POST request to the API endpoint
result = requests.post( data = self.create_data_request(message)
f"http://{self.server}:{self.port}/api/chat", json=data
).json()
# print_data = result.copy() response = requests.post(
# del print_data["message"] f"http://{self.server}:{self.port}/api/chat", json=data, stream=True
# del print_data["model"] )
# # Convert durations from nanoseconds to seconds # Iterate over the response
# for key in ['eval_duration', 'total_duration']: # Iterate over the response
# if key in print_data: for line in response.iter_lines():
# duration = print_data[key] / 1e9 # Convert nanoseconds to seconds # Filter out keep-alive new lines
# minutes, seconds = divmod(duration, 60) # Convert seconds to minutes and remainder seconds if line:
# print_data[key] = f'{int(minutes)}:{seconds:02.0f}' # Format as minutes:seconds decoded_line = line.decode('utf-8')
json_line = json.loads(decoded_line) # Parse the line as JSON
yield json_line['message']['content']
# pprint(print_data)
# print('Number of messages', len(messages))
if "message" in result: def generate(self, message):
answer = result["message"]["content"]
else: data = self.create_data_request(message)
pprint(result) # Make a POST request to the API endpoint
result = requests.post(
f"http://{self.server}:{self.port}/api/chat", json=data
)
try:
if 'message' in result.json():
answer = result.json()["message"]["content"]
else:
print_red(result.content)
raise 'Error occurred during API request'
except requests.exceptions.JSONDecodeError:
print_red(result.content)
raise Exception("Error occurred during API request") raise Exception("Error occurred during API request")
if self.chat: if self.chat:
@ -196,15 +217,15 @@ class LLM:
# Add the new message to the list # Add the new message to the list
self.messages.append({"role": "user", "content": message}) self.messages.append({"role": "user", "content": message})
# Calculate the total token length of the messages # Calculate the total length of the messages
total_tokens = sum([len((msg["content"])) for msg in self.messages]) total_length = sum([len((msg["content"])) for msg in self.messages])
# While the total token length exceeds the limit, remove the oldest messages # While the total length exceeds the limit, remove the oldest messages
while total_tokens > self.max_tokens: while total_length > self.max_length:
removed_message = self.messages.pop( removed_message = self.messages.pop(
1 1
) # Remove the oldest message (not the system message) ) # Remove the oldest message (not the system message)
total_tokens -= len((removed_message["content"])) total_length -= len((removed_message["content"]))
def unload_model(self): def unload_model(self):
data = { data = {
@ -228,14 +249,13 @@ if __name__ == "__main__":
parser.add_argument("--unload", action="store_true", help="Unload the model") parser.add_argument("--unload", action="store_true", help="Unload the model")
args = parser.parse_args() args = parser.parse_args()
# llm = LLM(model='llama3:70b-text-q4_K_M', keep_alive=6000, chat=True) llm = LLM(keep_alive=60000, chat=True, small=False)
llm = LLM(keep_alive=6000, chat=True)
if args.unload: if args.unload:
llm.unload_model() llm.unload_model()
else: else:
while True: while True:
message = input(">>> ") message = input(">>> ")
message = '''Hej
bad är kul'''
print(llm.generate(message)) print(llm.generate(message))

@ -0,0 +1,76 @@
from openai import OpenAI, RateLimitError
from dotenv import load_dotenv
import os
from _llm import LLM as LLM_ollama
from print_color import *
from time import sleep
load_dotenv()
class LLM_OpenAI:
    """Chat wrapper around the OpenAI API with a local streaming LLM as backup."""

    def __init__(
        self,
        system_prompt='Svara alltid på svenska. Svara bara på det som efterfrågas. Om du inte kan svara, skriv "Jag vet inte".',
        chat=False,
        model="gpt-3.5-turbo-0125",
        max_tokens=24000,
        sleep_time=0
    ):
        """Set up the OpenAI client, the backup LLM and the message history.

        Args:
            system_prompt: Content of the system message sent with every request.
            chat: If True, keep a running message history between calls.
            model: OpenAI model name.
            max_tokens: Upper bound for the history size, measured as the
                summed character length of the message contents.
            sleep_time: Seconds to sleep before every request.
        """
        self.chat = chat
        self.model = model
        # NOTE(review): stored but not passed to the API call in generate().
        self.temperature = 0
        self.max_tokens = max_tokens
        self.system_message = {"role": "system", "content": system_prompt}
        self.messages = [self.system_message]
        self.client = OpenAI(
            # The key is read from the OPEN_AI environment variable.
            api_key=os.getenv("OPEN_AI"),
        )
        self.llm_ollama = LLM_ollama(chat=False, stream=True)  # For backup
        self.sleep_time = sleep_time

    def build_message(self, message):
        """Append a user message to the history and trim the oldest entries.

        The oldest non-system messages are dropped while the summed character
        length of all contents exceeds ``self.max_tokens``. The system message
        and the message just added are always kept, so an over-long prompt is
        never silently discarded.

        Args:
            message: The user's message text.
        """
        # Add the new message to the list
        self.messages.append({"role": "user", "content": message})

        # Total length (in characters) of all message contents
        total_length = sum(len(msg["content"]) for msg in self.messages)

        # Index 0 is the system message and the last entry is the new message;
        # only what lies between them may be removed.
        while total_length > self.max_tokens and len(self.messages) > 2:
            removed_message = self.messages.pop(1)
            total_length -= len(removed_message["content"])

    def generate(self, prompt, stream=False, local=False):
        """Generate an answer to ``prompt``.

        Args:
            prompt: The user prompt.
            stream: If True, return the response stream instead of a string.
            local: If True, use the backup LLM instead of OpenAI.

        Returns:
            With ``stream=True`` the raw response stream (OpenAI stream object,
            or the backup LLM's generator of text chunks). Otherwise the
            complete answer as a string.
        """
        sleep(self.sleep_time)
        if self.chat:
            self.build_message(prompt)
            messages = self.messages
        else:
            messages = [self.system_message, {"role": "user", "content": prompt}]
        print(sum(len(msg["content"]) for msg in messages))

        # Tracks whether ``response`` is the backup LLM's chunk generator
        # rather than an OpenAI response object.
        from_backup = local
        if local:
            response = self.llm_ollama.generate_stream(prompt)
        else:
            try:
                response = self.client.chat.completions.create(
                    messages=messages,
                    model=self.model,
                    stream=stream
                )
            except RateLimitError as e:
                # Rate limited by OpenAI: fall back to the backup LLM.
                print_red(e)
                response = self.llm_ollama.generate_stream(prompt)
                from_backup = True

        if stream:
            return response

        if from_backup:
            # The backup yields text chunks; it has no ``choices`` attribute.
            answer = "".join(response)
        else:
            answer = response.choices[0].message.content
        if self.chat:
            self.messages.append({"role": "assistant", "content": answer})
        return answer

@ -0,0 +1,91 @@
import streamlit as st
import fitz
from fitz import Page, Document
from _llm import LLM
import re
from person_identifier import PersonFinder
from print_color import *
def set_name():
    """Button callback: advance ``st.session_state.name`` to the next queued name.

    Does nothing when the queue is empty or has been reset to ``None``, so a
    click after the last name no longer raises inside the callback.
    """
    names = st.session_state.names
    if names:
        st.session_state.name = names.pop(0)
def highlight_name_in_pdf(page: Page, name: str):
    """Highlight every hit of ``name`` on ``page`` and save the page as a PNG.

    The rendered image is written to the module-level ``image_filename``.
    """
    # Locate the name on the page and mark the hits.
    hits = page.search_for(name, quads=True)
    page.add_highlight_annot(hits)

    # Render the annotated page at 300 dpi and store it as an image file.
    page.get_pixmap(dpi=300).save(image_filename, 'png')
def show_image(filename):
    """Render the image stored at ``filename`` in the Streamlit page."""
    st.image(filename)
def get_page(page_number):
    """Return a cropped, standalone copy of page ``page_number`` of the module-level ``doc``."""
    # Copy just the requested page into a fresh, empty document.
    single_page_doc = fitz.open()
    single_page_doc.insert_pdf(doc, from_page=page_number, to_page=page_number)

    extracted = single_page_doc[0]
    extracted.set_cropbox(fitz.Rect(0, 100, 520, 800))
    return extracted
@st.cache_resource()
def get_extractor():
    """Create a ``PersonFinder``; Streamlit's resource cache reuses it across reruns."""
    extractor = PersonFinder()
    return extractor
# Script body: show one page of the PDF with the current name highlighted and
# step through the names found on that page with a "Next" button.
st.set_page_config(layout="wide")

filename = "Förhörsprotokoll.pdf"
image_filename = "highlighted.png"  # target file for the rendered, highlighted page
page_number = 89

# Open the PDF once per session and reuse it on reruns.
if 'doc' not in st.session_state:
    st.session_state.doc = fitz.open(filename)
doc = st.session_state.doc
page = get_page(page_number)

if 'all_names' not in st.session_state:
    st.session_state.all_names = {}

# (Re)extract the names when there is no queue yet or it was reset to None.
if 'names' not in st.session_state or st.session_state.names is None:
    person_extractor = PersonFinder(st.session_state.all_names)
    # NOTE(review): the instance is also passed explicitly as the first
    # argument — confirm this matches PersonFinder.extract_names' signature.
    st.session_state.names = person_extractor.extract_names(person_extractor, page.get_text())
    st.session_state.all_names = person_extractor.names
    print_blue(st.session_state.names)
    print_purple(st.session_state.all_names)
names = st.session_state.names

# The first name is taken from the queue here; later ones by the button callback.
if 'name' not in st.session_state:
    st.session_state.name = names.pop(0)
name = st.session_state.name

st.markdown(f'#### {name}')
highlight_name_in_pdf(page, name)

col1, col2 = st.columns([5,2])
with col1:
    show_image(image_filename)
with col2:
    # set_name runs as the click callback, before the script reruns.
    next = st.button("Next", on_click=set_name)
    if next:
        if len(names) == 0:
            # Queue exhausted: reset so the names are extracted again next run.
            st.session_state.names = None
        else:
            highlight_name_in_pdf(page, st.session_state.name)

@ -0,0 +1,43 @@
from _arango import db
from _llm import LLM
from langchain_text_splitters import CharacterTextSplitter
from print_color import *
# Fetch every interrogation that has no formatted text yet.
unformatted_cursor = db.aql.execute(
    'for doc in interrogations filter doc.formatted_text == null return doc',
    count=True,
)
interrogations = list(unformatted_cursor)

# Split on blank lines into pieces of at most 2000 characters, no overlap.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=2000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

for interrogation in interrogations:
    formatted_parts = []
    for chunk in text_splitter.split_text(interrogation['text']):
        print_yellow(len(chunk))
        llm = LLM(
            chat=False,
            system_prompt='Du formaterar text enligt med markdown för att göra den lättare att läsa. Använd inte rubriker, bara fet och stil. Om det förekommer en dialog fetmarkera den som talar, exempelvis ** DH: **. Namn ska göras fetade, även om det bara är ett förnamn. Svara alltid med EXAKT samma text som du fick, men formaterad. Svara alltid på svenska.',
        )

        # Fall back to the "person" field when the document has no "name".
        if 'name' not in interrogation:
            interrogation['name'] = interrogation['person']
        name = interrogation['name']

        prompt = f'''Kolla på texten nedan: \n\n\n{chunk}\n\n\nJag vill att du svarar med EXAKT samma text, men formaterad enligt markdown för att vara enklare att läsa. Formatera enligt följande:
- Använd aldrig rubriker (#)
- Om det är en längre dialog mellan förhörsledare (FL) och den hörde (DH) formatera dem med fetstil, exempelvis **DH: **.
- Gör namn personer fetade, även om det bara är ett förnamn. Den förhörde {name} ska inte vara fetad utan normal text.
Ibland är styckeindelningen inte korrekt, försök att göra det lättare att läsa.
Svara ENBART med den formaterade texten, ingenting annat.'''

        formatted_chunk = llm.generate(prompt)
        print_blue(formatted_chunk)
        formatted_parts.append(formatted_chunk)

    # Store the joined result back on the interrogation document.
    interrogation['formatted_text'] = '\n '.join(formatted_parts)
    db.collection('interrogations').update(interrogation, check_rev=False)

@ -1,4 +1,8 @@
from _arango import arango from _arango import arango
from _chroma import ChromaDB
from langchain_text_splitters import CharacterTextSplitter
from print_color import *
from _llm import LLM
def truncate(): def truncate():
arango.db.collection("other_persons").truncate() arango.db.collection("other_persons").truncate()
@ -8,55 +12,63 @@ def truncate():
def clear_info_persons(): def clear_info_persons():
persons = list(arango.db.collection("persons").all()) persons = list(arango.db.collection("persons").all())
for person in persons: for person in persons:
if 'other' in person: if not person['confirmed']:
if person['other']: arango.db.collection("persons").delete(person)
arango.db.collection('persons').delete(person) continue
continue person["info"] = []
person['info'] = [] person["mentioned_in_interrogation"] = []
person['mentioned_in_interrogation'] = [] person["mentioned_as"] = {}
arango.db.collection('persons').update(person, merge=False) arango.db.collection("persons").update(person, merge=False)
def clear_changer_interrogations(): def clear_changer_interrogations():
interrogations = list(arango.db.collection("interrogations").all()) interrogations = list(arango.db.collection("interrogations").all())
for interrogation in interrogations: for interrogation in interrogations:
interrogation['mentioned_persons'] = [] interrogation["mentioned_persons"] = []
arango.db.collection('interrogations').update(interrogation, merge=False) arango.db.collection("interrogations").update(interrogation, merge=False)
def clean_mentioned_as(): def clean_mentioned_as():
persons = list(arango.db.collection("persons").all()) persons = list(arango.db.collection("persons").all())
for person in persons: for person in persons:
if 'mentioned_as' in person: if "mentioned_as" in person:
mentioned_as = [] mentioned_as = []
for i in person['mentioned_as']: for i in person["mentioned_as"]:
if i not in mentioned_as: if i not in mentioned_as:
mentioned_as.append(i) mentioned_as.append(i)
person['mentioned_as'] = mentioned_as person["mentioned_as"] = mentioned_as
arango.db.collection('persons').update(person, merge=False) person['info'] = []
arango.db.collection("persons").update(person, merge=False)
db = arango.db
cursor = db.aql.execute('for doc in rumors return doc')
rumors = list(cursor)
for rumor in rumors:
rumor['class'] = rumor['class'].replace('.', '').strip().lower()
db.collection('rumors').update(rumor, merge=False)
text_splitter = CharacterTextSplitter(
separator="\n\n",
chunk_size=1000,
chunk_overlap=100,
length_function=len,
is_separator_regex=False,
)
db = arango.db
# persons = list(arango.db.collection("persons").all()) interrogations = list(db.aql.execute('for doc in interrogations filter doc.person_mentioned_as == null return doc'))
pms = list(db.aql.execute('for doc in pms return {"_id": doc._id, "page": doc.page}'))
# for person in persons: interrogations = interrogations + pms
# for interrogation in person['interrogations']:
# arango.db.collection('interrogations').update({'_key': interrogation, 'person_id': person['_id']}, )
# print(f"Updated {interrogation} with person_id {person['_id']}")
interrogations.sort(key=lambda x: x['page'])
# interrogations = list(arango.db.collection("interrogations").all()) for i in interrogations:
llm = LLM(chat=False)
if 'text' not in i:
continue
text = i['text'][:1000]
print_purple(text)
name = i['name']
prompt = f'''Nedan är ett förhör med {name}: \n\n\n{text}\n\n\nOm du ser till själva förhöret, vilket namn används för {name}? Om personen exempelvis bara skrivs ut med förnamn så vara med det. Svara ENBART med namnet, inget annat.'''
answer = llm.generate(prompt)
i['person_mentioned_as'] = answer
db.collection('interrogations').update(i, check_rev=False)
# for interrogation in interrogations:
# interrogation['person_id'] = 'persons/' + interrogation['person'].replace('persons_', '')
# arango.db.collection('interrogations').update(interrogation, merge=False)

@ -3,6 +3,8 @@ import fitz
from _arango import arango from _arango import arango
from openai import OpenAI from openai import OpenAI
from pprint import pprint from pprint import pprint
from print_color import *
class Section: class Section:
def __init__(self, type, page, filename="Förhörsprotokoll.pdf"): def __init__(self, type, page, filename="Förhörsprotokoll.pdf"):
@ -15,14 +17,17 @@ class Section:
self.date = "" self.date = ""
self.start_page = page self.start_page = page
self.key = "" self.key = ""
self.llm = LLM(chat=True)
def add_to_arango(self): def add_to_arango(self):
key = arango.fix_key_name( # prompt = f'Sammanfatta texten nedan: \n\n """{self.text[:7000]}""" \n\nSammanfattning:'
f"{self.person}_{self.date}_p.{self.start_page}" # llm = LLM(chat=False)
) # self.summary = llm.generate(prompt)
# print_green(self.summary)
arango_doc = { arango_doc = {
"_key": key, "_key": self.key,
"person": self.person, "person": self.person,
"role": self.role, "role": self.role,
"topic": self.topic, "topic": self.topic,
@ -30,45 +35,65 @@ class Section:
"page": self.start_page, "page": self.start_page,
"text": self.text, "text": self.text,
"filename": self.filename, "filename": self.filename,
#"summary": self.summary,
"name": self.name,
} }
arango.db.collection(self.type).insert(arango_doc, overwrite=True) arango.db.collection(self.type).insert(arango_doc, overwrite_mode="update")
print(f"Added {self.type} to ArangoDB with key {key}") print(f"Added {self.type} to ArangoDB with key {self.key}")
def extract_interrogation(self, text):
    """Populate person, name, role, topic, date and key for an interrogation.

    The LLM held in ``self.llm`` is first asked who is being interrogated in
    *text*. The later questions do not repeat the text.
    NOTE(review): this presumably relies on ``self.llm`` keeping the
    conversation history between calls - confirm against the LLM wrapper.

    Args:
        text: Text of the page on which the interrogation starts.
    """
    self.person = self.llm.generate(
        f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är förhörd? Namnet står ofta på formen "Efternamn, Förnamn". Jag vill att su ska svara på formen "Förnamn Efternamn"'
    )

    # If the formal name is not exactly two words, ask separately for the
    # first and last name the person goes by.
    if len(self.person.split(' ')) != 2:
        first_name = self.llm.generate(
            f'Personens formella namn är alltså {self.person}. Om du kollar på själva föhörstexten, är personens förnamn (det som personen kallas för)? Svara bara med ett förnamn.'
        )
        last_name = self.llm.generate('Och i efternamn?')
        self.name = f'{first_name} {last_name}'
        print_rainbow(self.person, '->', self.name)
    else:
        self.name = self.person

    self.role = self.llm.generate(
        f'Vem är {self.person}? Svara så kort som möjligt med titel eller beskrivning.'
    )
    self.topic = self.llm.generate(
        'Vad handlade förhöret om? Svara så kortfattat som möjligt.'
    )
    # The format hint previously read "YYY-MM-DD" (three-letter year).
    self.date = self.llm.generate(
        'När ägde förhöret rum? Svara på formen YYYY-MM-DD'
    )

    # Reuse the key of the interrogation already stored for this start page.
    # The page number is passed as a bind variable instead of being
    # interpolated into the query string.
    cursor = arango.db.aql.execute(
        'for doc in interrogations filter doc.page == @page return doc._key',
        bind_vars={'page': self.start_page},
        count=True,
    )
    if cursor.count() == 1:
        self.key = cursor.next()
    else:
        print_red("Could not find key")
        self.key = arango.fix_key_name(f"{self.person}_{self.date}_p.{self.start_page}")
def extract_pm(self, text):
    """Populate person, role, topic, date and key for a PM section.

    Four independent questions are put to ``self.llm``; each one embeds
    *text* in full. The key is then built from the answers and the start
    page via ``arango.fix_key_name``.

    Args:
        text: Text of the page on which the PM starts.
    """
    # Shared lead-in that shows the page text to the model.
    excerpt = f'Kolla på texten nedan: \n\n """{text}""" \n\n '

    self.person = self.llm.generate(
        excerpt + 'Vem är uppgiftslämnare? Svara på formen "Förnamn Efternamn" \n\nPM:'
    )
    self.role = self.llm.generate(
        excerpt + f'Vem är {self.person}? Svara "None" om det inte framgår. \n\nTitel på person:'
    )
    # The trailer previously read "\n\Svar:" (an invalid escape sequence that
    # put a literal backslash in the prompt).
    self.topic = self.llm.generate(
        excerpt + 'Vad handlade informationen om? Svara så kortfattat som möjligt. Svara "None" om det inte framgår. \n\nSvar:'
    )
    # The format hint previously read "YYY-MM-DD" (three-letter year).
    self.date = self.llm.generate(
        excerpt + 'När lämnades informationen? Svara på formen YYYY-MM-DD \n\nDatum:'
    )
    self.key = arango.fix_key_name(f"{self.person}_{self.date}_p.{self.start_page}")
def new_interrogation(page, section): def new_interrogation(page, section):
if section.text != "": if section.text != "":
@ -85,25 +110,7 @@ def new_pm(page, section):
section.extract_interrogation(page.get_text()) section.extract_interrogation(page.get_text())
return section return section
def is_new_interrogation(page_text, page):
# * Llama
llm = LLM(chat=False, model="llama3:8b-instruct-q5_K_M")
# Open the PDF file
filename = "Förhörsprotokoll.pdf"
area = fitz.Rect(0, 40, 520, 800) # To exlude the header
doc = fitz.open(f"pdfs/{filename}")
section = Section("interrogations", 0)
for page in doc.pages(9, len(doc) - 1):
# Get the text from the page
page_text = page.get_text("text")
# Check if there is a new interrogation # Check if there is a new interrogation
control_words_interrogation = [ control_words_interrogation = [
"Förhörsdatum", "Förhörsdatum",
@ -118,31 +125,58 @@ for page in doc.pages(9, len(doc) - 1):
if word in page_text: if word in page_text:
n_control_words_interrogation += 1 n_control_words_interrogation += 1
if n_control_words_interrogation >= 2: if n_control_words_interrogation >= 2:
section = new_interrogation(page, section) print_purple('New interrogation', page)
area = fitz.Rect(0, 400, 520, 800) return True
def is_new_pm(page_text, page):
    """Tell whether a page looks like the first page of a PM.

    A page counts as a new PM when at least two of the known PM form
    headings occur somewhere in its text.

    Args:
        page_text: Plain text of the page.
        page: The page object; only used in the log line.

    Returns:
        True if two or more control words were found, otherwise False.
    """
    control_words_pm = (
        "PM",
        "Uppgiften avser",
        "Upprättad av",
        "Sätt på vilket uppgift lämnats",
        "Uppgiftslämnare",
    )
    n_control_words_pm = sum(word in page_text for word in control_words_pm)
    if n_control_words_pm >= 2:
        print_blue('New PM', page)
        return True
    # Explicit negative answer instead of falling through with None.
    return False
else:
# It's a "normal" page # * Llama
area = fitz.Rect(0, 40, 520, 800) # To exlude the header
# Open the PDF file
filename = "Förhörsprotokoll.pdf"
area = fitz.Rect(0, 40, 520, 800) # To exlude the header
doc = fitz.open(f"/home/lasse/mala/Förhörsprotokoll.pdf")
section = Section("interrogations", 0)
for page in doc.pages(9, len(doc) - 1):
# Get the text from the page
page_text = page.get_text("text")
if is_new_interrogation(page_text, page):
section = new_interrogation(page, section)
area = fitz.Rect(0, 400, 520, 800)
elif is_new_pm(page_text, page):
# Check if there is a new PM
area = fitz.Rect(0, 400, 520, 800)
section = new_pm(page, section)
else:
# It's a "normal" page
area = fitz.Rect(0, 40, 520, 800) # To exlude the header
blocks = page.get_text("blocks", clip=area) blocks = page.get_text("blocks", clip=area)
for block in blocks: for block in blocks:
section.text += block[4] + "\n\n" section.text += block[4] + "\n\n"
new_interrogation(page, section)

@ -1,27 +1,49 @@
import multiprocessing import multiprocessing
from _llm import LLM from _llm import LLM as LLM_garda
from _openai import LLM_OpenAI as LLM
#from _llm import LLM
from _arango import arango from _arango import arango
from langchain_text_splitters import CharacterTextSplitter from langchain_text_splitters import CharacterTextSplitter
import difflib import difflib
import re import re
import random import random
from time import sleep from time import sleep
import traceback
from pprint import pprint from pprint import pprint
from print_color import * from print_color import *
class Interrogation:
    """Holds the key and text of one interrogation plus working state."""

    def __init__(self, _key, text):
        """Store the key and text and start with empty working state.

        Args:
            _key: Key of the interrogation.
            text: Text of the interrogation.
        """
        self._key = _key
        self.text = text
        # Starts empty; a fresh list per instance.
        self.mentioned_persons = []
        # None until chunks are assigned by the caller.
        self.chunks = None
def check_name(person, answer_person, text): def check_name(person, answer_person, text):
print_yellow(person, " - ", answer_person) print_yellow(person, " - ", answer_person)
same = False same = False
# If full name similarity is below a threshold (e.g., 0.5), compare first names only # If full name similarity is below a threshold (e.g., 0.5), compare first names only
# If person only has one name, first or last, compare that to first and last name of answer_person # If person only has one name, first or last, compare that to first and last name of answer_person
print('Length person:', len(person.strip().split()))
if len(person.strip().split()) == 1: if len(person.strip().split()) == 1:
llm = LLM() llm = LLM()
answer_first_name = answer_person.split()[0].strip() answer_first_name = answer_person.split()[0].strip()
answer_last_name = answer_person.split()[-1].strip() answer_last_name = answer_person.split()[-1].strip()
first_name_similarity = difflib.SequenceMatcher(
None, person, answer_first_name
).ratio()
last_name_similarity = difflib.SequenceMatcher(
None, person, answer_last_name
).ratio()
print("First name similarity:", first_name_similarity)
print("Last name similarity:", last_name_similarity)
if difflib.SequenceMatcher(None, person, answer_first_name).ratio() > 0.9: if difflib.SequenceMatcher(None, person, answer_first_name).ratio() > 0.9:
if answer_last_name in text: if answer_last_name in text:
same = True same = True
@ -31,11 +53,13 @@ def check_name(person, answer_person, text):
i["name"].split()[0] for i in db.collection("persons").all() i["name"].split()[0] for i in db.collection("persons").all()
] ]
first_name_count = first_names.count(answer_first_name) first_name_count = first_names.count(answer_first_name)
print("First name count:", first_name_count)
if first_name_count == 1: if first_name_count == 1:
same = True same = True
else: else:
llm = LLM_garda()
answer = llm.generate( answer = llm.generate(
f'Nämns någon med efternamnet "{answer_last_name}" i texten nedan?\n\n"""{text[:6000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"' f'Nämns någon med efternamnet "{answer_last_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
) )
if "JA" in answer: if "JA" in answer:
same = True same = True
@ -44,8 +68,9 @@ def check_name(person, answer_person, text):
if answer_first_name in text: if answer_first_name in text:
same = True same = True
else: else:
llm = LLM_garda()
answer = llm.generate( answer = llm.generate(
f'Nämns någon med förnamnet "{answer_first_name}" i texten nedan?\n\n"""{text[:6000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"' f'Nämns någon med förnamnet "{answer_first_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
) )
if "JA" in answer: if "JA" in answer:
same = True same = True
@ -53,18 +78,9 @@ def check_name(person, answer_person, text):
else: else:
name_similarity = difflib.SequenceMatcher(None, person, answer_person).ratio() name_similarity = difflib.SequenceMatcher(None, person, answer_person).ratio()
print("Similarity:", name_similarity) print("Similarity:", name_similarity)
# person_first_name = person.split()[0]
# answer_person_first_name = answer_person.split()[0]
# first_name_similarity = difflib.SequenceMatcher(
# None, person_first_name, answer_person_first_name
# ).ratio()
# person_last_name = person.split()[-1]
# answer_person_last_name = answer_person.split()[-1]
# print("new:", name_similarity) if name_similarity > 0.85:
if name_similarity > 0.9:
same = True same = True
return same return same
@ -86,55 +102,50 @@ def execute_query_with_retry(db, query, max_retries=5, delay=2):
# Then, in your extract_persons function: # Then, in your extract_persons function:
def extract_persons(interrogation): def extract_persons(interrogation, names_interrogation):
known_persons = {
"Douglas": "Douglas Bengtsson",
"Rashid": "Rashid Sheiksaid",
"Emanuel": "Emanuel Johansson",
"Robert": "Robert Bengtsson",
}
sleep(random.uniform(0.05, 0.3))
print("INTERROGATION:", interrogation["_key"])
q = "for doc in persons filter doc.other != true return doc"
result = execute_query_with_retry(db, q)
persons_docs = list(result)
persons = [i["name"].strip() for i in persons_docs]
first_names = {i["name"].split()[0].strip(): i["name"] for i in persons_docs}
persons_dict = {i["name"]: i for i in persons_docs}
text_splitter = CharacterTextSplitter(
separator="\n\n",
chunk_size=4000,
chunk_overlap=0,
length_function=len,
is_separator_regex=False,
)
chunks = text_splitter.split_text(interrogation["text"])
llm = LLM( llm = LLM(
chat=True, chat=True,
system_prompt="Du är en assistent som hjälper till att hitta personer i ett polisförhör. Du får en del av texten från förhöret åt gången. Svara bara när personen finns i den del du får, hitta inte på personer.", system_prompt="Du är en assistent som hjälper till att hitta personer i ett polisförhör. Svara bara när personen finns i den del du får, hitta inte på personer.",
) )
names = [] names = []
for chunk in chunks: # Find persons in the text
# Find persons in the text prompt = f'''Det här är en text från ett polisförhör där {interrogation["person"]} förhörs:\n
prompt = f'''Det här är en text från ett polisförhör där {interrogation["person"]} förhörs:\n """{chunk}"""\n
"""{chunk}"""\n Vilka personer nämns i texten? Svara ENBART med en pythonformaterad lista av namn.
Vilka personer nämns i texten som inte förekommit tidigare? Svara ENBART med en pythonformaterad lista av namn. Exempel svar för att du ska förstå formen: "["namn1", "namn2", "namn3"]".
Exempel svar för att du ska förstå formen: "["namn1", "namn2", "namn3"]". Jag är inte intresserad av förhörsledaren eller personen som förhörs.'''
Jag är inte intresserad av förhörsledaren eller personen som förhörs.''' response = llm.generate(prompt)
response = llm.generate(prompt) response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")
response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")
for name in [i.strip() for i in response.split(",") if len(i) > 2]:
for name in [i.strip() for i in response.split(",") if len(i) > 2]: if name not in names and name not in names_interrogation:
if name not in names: same_name = False
if names_interrogation != []:
for name_interrogation in list(names_interrogation):
if name in name_interrogation:
same_name = True
names_interrogation[name] = names_interrogation[name_interrogation]
person_arango = db.aql.execute('for doc in persons filter doc.name == @name return doc', bind_vars={'name': names_interrogation[name_interrogation]}, count=True)
if person_arango:
person_arango = list(person_arango)[0]
if interrogation["_key"] not in person_arango["mentioned_as"]:
person_arango["mentioned_as"][interrogation["_key"]] = [name]
else:
if name not in person_arango["mentioned_as"][interrogation["_key"]]:
person_arango["mentioned_as"][interrogation["_key"]].append(name)
db.collection("persons").update(person_arango, check_rev=False)
if not same_name:
names.append(name) names.append(name)
else:
print_green('Name already in names_interrogation', name)
return names, names_interrogation
def identify_persons(names, chunk, names_interrogation):
for name in names: for name in names:
print_blue('New name:', name)
# Compare the person to a list of known persons # Compare the person to a list of known persons
prompt = f'''Jag vill veta vem {name} är. Kolla på förhöret nedan och svara om du hittar något om personen där. prompt = f'''Jag vill veta vem {name} är. Kolla på förhöret nedan och svara om du hittar något om personen där.
"""{chunk}"""\n """{chunk}"""\n
@ -151,6 +162,7 @@ def extract_persons(interrogation):
elif name.split().reverse() in persons: elif name.split().reverse() in persons:
print("Vände och hittade ✌", name.split().reverse()) print("Vände och hittade ✌", name.split().reverse())
person = persons_dict[name.split().reverse()] person = persons_dict[name.split().reverse()]
else: else:
closest_matches = difflib.get_close_matches(name, persons, n=4, cutoff=0.3) closest_matches = difflib.get_close_matches(name, persons, n=4, cutoff=0.3)
@ -160,8 +172,8 @@ def extract_persons(interrogation):
persons_string = "\n".join(closest_matches) persons_string = "\n".join(closest_matches)
prompt = f"""Jag behöver identifiera {name}. Nedan är en lista på personer det kanske skulle kunna vara:\n prompt = f"""Jag behöver identifiera {name}. Nedan är en lista på personer det kanske skulle kunna vara:\n
{persons_string}\n {persons_string}\n
Är {name} någon av dessa personer? I texten kan personen stå med bara sitt förnamn eller efternamn, kolla speciellt efter namn i listan där förnamnet eller efternamnet stämmer. Namnet kan också vara felstavat, men inte ett helt annat namn. Är {name} någon av dessa personer? I texten kan personen stå med bara sitt förnamn eller efternamn, kolla speciellt efter namn i listan där förnamnet eller efternamnet stämmer. Namnet i förhöret kan också vara felstavat, exempelvis ett s istället för två eller kan bokstäver ha bytt plats, men inte ett helt annat namn.
Svara BARA med namnet personen ur listan. Är du inte säker svara "None".""" Svara BARA med namnet personen ur listan. Är det inte någon av personerna i listan svara "None"."""
answer_person = llm.generate(prompt) answer_person = llm.generate(prompt)
if answer_person in persons and check_name( if answer_person in persons and check_name(
@ -169,35 +181,35 @@ def extract_persons(interrogation):
): ):
person = persons_dict[answer_person] person = persons_dict[answer_person]
else:
print_red(f"""Answer "{answer_person}" not in persons""")
if person: if person:
print_green(f'{name} identified: {person["name"]}', "\n") if name not in names_interrogation:
names_interrogation[name] = person['name']
print_green(f'{name} identified: {person["name"]}', "\n")
if "info" not in person: if "info" not in person:
person["info"] = [] person["info"] = []
if info not in person["info"]: if info not in person["info"]:
person["info"].append(info) person["info"].append(info)
if "mentioned_in_interrogation" not in person:
person["mentioned_in_interrogation"] = [] if interrogation["_key"] not in person["mentioned_as"]:
person["mentioned_as"][interrogation["_key"]] = [name]
else:
if name not in person["mentioned_as"][interrogation["_key"]]:
person["mentioned_as"][interrogation["_key"]].append(name)
if interrogation["_key"] not in person["mentioned_in_interrogation"]: if interrogation["_key"] not in person["mentioned_in_interrogation"]:
person["mentioned_in_interrogation"].append(interrogation["_key"]) person["mentioned_in_interrogation"].append(interrogation["_key"])
if "mentioned_as" not in person:
person["mentioned_as"] = []
if {name: interrogation["_key"]} not in person["mentioned_as"]:
person["mentioned_as"].append({name: interrogation["_key"]})
db.collection("persons").update(person, check_rev=False) db.collection("persons").update(person, check_rev=False)
# db.collection("all_relations").insert(
# { # If the person was not identified as a confirmed person, add to the unconfirmed persons
# "_from": interrogation["person_id"],
# "_to": person["_id"],
# "relation": "mentioned_by",
# "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
# "_key": f'{interrogation["_key"]}_{person["_key"]}'
# },
# overwrite_mode="update",
# merge=True,
# )
else: else:
if name not in names_interrogation:
names_interrogation[name] = name
print(f"\033[91m{name} not identified\033[0m") print(f"\033[91m{name} not identified\033[0m")
print_yellow( print_yellow(
"\n".join([f"- {i}" for i in persons_string.split("\n")]), "\n" "\n".join([f"- {i}" for i in persons_string.split("\n")]), "\n"
@ -207,45 +219,41 @@ def extract_persons(interrogation):
_key = arango.fix_key_name(name) #TODO Are there multiple persons with the same name? _key = arango.fix_key_name(name) #TODO Are there multiple persons with the same name?
# If no confirmed person was identified, create a new person or add to another unconfirmed person # If no confirmed person was identified, create a new person or add to another unconfirmed person
if not db.collection("persons").get(_key): doc = db.collection("persons").get(_key)
if db.collection("persons").get(_key): if doc:
doc = db.collection("persons").get(_key) if interrogation["_key"] not in doc["mentioned_as"]:
doc["mentioned_as"][interrogation["_key"]] = [name]
else: else:
doc = { if name not in doc["mentioned_as"][interrogation["_key"]]:
"_key": _key, doc["mentioned_as"][interrogation["_key"]].append(name)
"name": name,
"info": [info],
"other": True,
"confirmed": False,
"mentioned_in_interrogation": [interrogation["_key"]],
}
else:
doc = db.collection("persons").get(_key)
if interrogation["_key"] not in doc["mentioned_in_interrogation"]: if interrogation["_key"] not in doc["mentioned_in_interrogation"]:
doc["mentioned_in_interrogation"].append(interrogation["_key"]) doc["mentioned_in_interrogation"].append(interrogation["_key"])
if info not in doc["info"]: if info not in doc["info"]:
doc["info"].append(info) doc["info"].append(info)
else:
doc = {
"_key": _key,
"name": name,
"info": [info],
"confirmed": False,
"mentioned_in_interrogation": [interrogation["_key"]],
"mentioned_as": {interrogation["_key"]: [name]},
}
db.collection("persons").insert(doc, merge=False, overwrite_mode='update') db.collection("persons").insert(doc, merge=False, overwrite_mode='update')
# db.collection("all_relations").insert( if person and person['_key'] not in interrogation["mentioned_persons"]:
# { interrogation["mentioned_persons"].append(person['_key'])
# "_from": interrogation["person_id"], db.collection("interrogations").update(interrogation, check_rev=False)
# "_to": doc["_id"],
# "relation": "mentioned_by",
# 'other': True,
# "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
# "_key": f'{interrogation["_key"]}_{doc["_key"]}'
# },
# overwrite_mode="update",
# merge=True,
# )
if __name__ == "__main__": if __name__ == "__main__":
db = arango.db db = arango.db
q = 'for doc in interrogations filter doc._key == "Markus_Lindahl_2023-02-20_p.93" return doc' q = 'for doc in interrogations return doc'
interrogations = list(db.aql.execute(q)) interrogations = list(db.aql.execute(q))
interrogations.sort(key=lambda x: x["date"]) interrogations.sort(key=lambda x: x["date"])
@ -276,9 +284,41 @@ if __name__ == "__main__":
# rumors = list(db.aql.execute(q)) # rumors = list(db.aql.execute(q))
# interrogations = [interrogation for interrogation in interrogations if interrogation['_key'] not in rumors] # interrogations = [interrogation for interrogation in interrogations if interrogation['_key'] not in rumors]
# print('Number of interrogations to process:', len(interrogations)) # print('Number of interrogations to process:', len(interrogations))
print(len(interrogations))
for interrogation in interrogations: for interrogation in interrogations:
extract_persons(interrogation) names_interrogation = {}
known_persons = {
"Douglas": "Douglas Bengtsson",
"Rashid": "Rashid Sheiksaid",
"Emanuel": "Emanuel Johansson",
"Robert": "Robert Bengtsson",
"Marlene": "Marlene Ahlqvist",
"Jhonny": "Jhonny Backman",
}
sleep(random.uniform(0.05, 0.3))
print("INTERROGATION:", interrogation["_key"])
q = "for doc in persons filter doc.confirmed == true return doc"
result = execute_query_with_retry(db, q)
persons_docs = list(result)
persons = [i["name"].strip() for i in persons_docs]
first_names = {i["name"].split()[0].strip(): i["name"] for i in persons_docs}
persons_dict = {i["name"]: i for i in persons_docs}
text_splitter = CharacterTextSplitter(
separator="\n\n",
chunk_size=8000,
chunk_overlap=0,
length_function=len,
is_separator_regex=False,
)
chunks = text_splitter.split_text(interrogation["text"])
for chunk in chunks:
names = extract_persons(interrogation)
exit() exit()
with multiprocessing.Pool(processes=3) as pool: with multiprocessing.Pool(processes=3) as pool:
pool.map(extract_persons, interrogations) pool.map(extract_persons, interrogations)

@ -5,17 +5,100 @@ from pprint import pprint
from pprint import pprint from pprint import pprint
from langchain_text_splitters import CharacterTextSplitter from langchain_text_splitters import CharacterTextSplitter
import multiprocessing import multiprocessing
from print_color import *
def describe_relation(person1, person2, relation, text):
    """Ask the LLM what *text* says about a relation between two persons.

    Args:
        person1: Name of the first person.
        person2: Name of the second person.
        relation: Short label of the relation, as found earlier.
        text: The text in which the relation is described.

    Returns:
        The LLM response, unmodified. The prompt asks the model to answer
        "None" when the relation cannot be described.
    """
    llm = LLM(chat=False, system_prompt="Du ska hitta relationer i en text. Svara alltid enligt angiven form och alltid på svenska.")
    # The prompt lines are kept at column 0 so that no indentation leaks
    # into the text sent to the model.
    prompt = f'''
I texten nedan beskrivs att {person1} och {person2} har relationen "{relation}". Läs texten och sammanfatta kortfattat vad som beskrivs om relationen mellan {person1} och {person2}:\n\n"""{text}"""\n
Svara ENBART med information om relationen, inga hälsningsfraser eller liknande.
Relationen ska vara kortfattad och stämma med texten. Om det inte går att beskriva relationen svara med "None".
Vad står det om relationen "{relation}" mellan {person1} och {person2}?
'''
    response = llm.generate(prompt)
    print_rainbow(relation, response)
    return response
def find_relations(interrogation):
    """Find the relations between persons described in an interrogation.

    The text is split into chunks on blank lines. For every chunk the LLM is
    asked for lines of the form ``person1;person2;relation``; each parsed
    line is passed to ``describe_relation`` for a short description, and the
    results are grouped per ordered (person1, person2) pair.

    Args:
        interrogation (dict): The interrogation; its "text", "person" and
            "_key" entries are read.

    Returns:
        list[dict]: One entry per (person1, person2) pair, in order of first
        appearance, with the keys "_key", "from", "to", "relations",
        "interrogation" and "chunks".
    """
    text_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=6000,
        chunk_overlap=0,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_text(interrogation["text"])
    llm = LLM(chat=False, system_prompt="Du ska hitta relationer i en text. Svara alltid enligt angiven form och alltid på svenska.")

    # Keyed by (person1, person2); dicts keep insertion order, so the
    # returned list is ordered by first appearance.
    relations_by_pair = {}

    # enumerate gives the true chunk position even when two chunks have
    # identical text, which chunks.index(chunk) would not.
    for chunk_number, chunk in enumerate(chunks):
        prompt = f"""Nedan är en bit av ett polisförhör med {interrogation['person']}. Jag vill att du hittar alla relationer mellan identifierbara personer som beskrivs i själva förhöret:\n\n{chunk}\n\n
Svara formen "person1;person2;relation\n". Var noga med hur semikolon används för att skilja personerna och relationen, och ny rad efter varje relation (informationen ska sedan användas för en CSV fil). Svara svenska.
Nedan är ett påhittat exempel för att du ska förstå hur du kan svara:
<exempel>
person1;person2;gick grundskolan tillammans, spelade fotboll
person2;person3;gifta sedan 2022
</exempel>
Beskrivningen av relationen ska vara kortfattad och stämma med texten.
Om det inte finns någon relation, svara "None".
Svara ENBART med relationerna, INGENTING annat som en hälsning eller förklaring.
"""
        response = llm.generate(prompt)
        print_blue(response)

        for line in response.split("\n"):
            # Drop stray whitespace (e.g. "\r" or trailing spaces) first.
            line = line.strip()
            if line == "None" or ';' not in line:
                continue
            try:
                person1, person2, relation = (part.strip() for part in line.split(";", 2))
            except ValueError as e:
                # Fewer than three fields on the line.
                print_red(f"Error: {e}")
                print_red(f"Relation: {line}")
                continue

            description = describe_relation(person1, person2, relation, chunk)
            entry = {'relation': relation, 'description': description, 'chunk_number': chunk_number}

            pair = (person1, person2)
            if pair in relations_by_pair:
                relations_by_pair[pair]["relations"].append(entry)
            else:
                relations_by_pair[pair] = {
                    "_key": arango.fix_key_name(f"{person1}-{person2}_{interrogation['_key']}"),
                    "from": person1,
                    "to": person2,
                    "relations": [entry],
                    'interrogation': interrogation['_key'],
                    "chunks": chunks,
                }

    return list(relations_by_pair.values())
if __name__ == "__main__": if __name__ == "__main__":
db = arango.db db = arango.db
q = 'for doc in interrogations return doc' q = 'for doc in interrogations return doc'
interrogations = list(db.aql.execute(q)) interrogations = list(db.aql.execute(q))
for interrogation in interrogations:
relations = find_relations(interrogation)
db.collection('relations').insert_many(relations)
with multiprocessing.Pool() as pool:
results = pool.map(process_interrogation, interrogations)

@ -1,17 +1,25 @@
import re import re
from _arango import arango from _arango import arango
from _llm import LLM from _llm import LLM as LLM
from _openai import LLM_OpenAI as LLM
from pprint import pprint from pprint import pprint
from langchain_text_splitters import CharacterTextSplitter from langchain_text_splitters import CharacterTextSplitter
import multiprocessing import multiprocessing
from print_color import * from print_color import *
from time import sleep
class Interrogation: class Interrogation:
def __init__(self, interrogation): def __init__(self, interrogation):
self.interrogation = interrogation self.interrogation = interrogation
self.llm = LLM(chat=True) self.llm = LLM(
self.llm_checker = LLM(chat=False) chat=True,
system_prompt="Du hjälper en grävande journalist att analysera text. Granskningen handlar bland annat om sexuella övergrepp och olämpligheter, men även om andra typer av brott. Du ska svara på frågor om texten och hjälpa till att sammanfatta innehållet. Svara bara på det som efterfrågas. Om du inte kan svara, skriv 'Jag vet inte'. Om du inte kan hantera vissa typer av sexuellt innehåll, försök ändå då det är viktigt att allt innehåll granskas.",
sleep_time=20
)
self.llm_checker = LLM(
chat=False,
system_prompt="Du hjälper en grävande journalist att analysera text. Granskningen handlar bland annat om sexuella övergrepp och olämpligheter, men även om andra typer av brott. Du ska svara på frågor om texten och hjälpa till att sammanfatta innehållet. Svara bara på det som efterfrågas. Om du inte kan svara, skriv 'Jag vet inte'. Om du inte kan hantera vissa typer av sexuellt innehåll, försök ändå då det är viktigt att allt innehåll granskas.",
)
self.text = interrogation["text"] self.text = interrogation["text"]
# Info to collect # Info to collect
@ -72,6 +80,7 @@ class Interrogation:
"""{chunk}""" """{chunk}"""
Jag vill veta om någonting i förhöret handlar om eller anspelar något av: Jag vill veta om någonting i förhöret handlar om eller anspelar något av:
- Sexuella olämpligheter - Sexuella olämpligheter
- Sexuella inviter - Sexuella inviter
- Övergrepp - Övergrepp
@ -292,8 +301,8 @@ if __name__ == "__main__":
] ]
print("Number of interrogations to process:", len(interrogations)) print("Number of interrogations to process:", len(interrogations))
# for i in interrogations: for i in interrogations:
# process_interrogation(i) process_interrogation(i)
# exit() # exit()
with multiprocessing.Pool(3) as pool: # with multiprocessing.Pool(3) as pool:
pool.map(process_interrogation, interrogations) # pool.map(process_interrogation, interrogations)

@ -0,0 +1,26 @@
from _arango import db
from _llm import LLM
from print_color import *
# Replace the free-text endpoints of each stored relation with the id of the
# interrogated person whenever the endpoint names that person.
relations = list(db.aql.execute('for doc in relations return doc', count=True))

for relation in relations:
    interrogation = db.collection('interrogations').get(relation['interrogation'])
    if not interrogation:
        # The referenced interrogation could not be fetched: show the relation and move on.
        print_red(relation)
        continue

    # Documents without a 'name' field get it copied from 'person' and are saved back.
    if 'name' not in interrogation:
        interrogation['name'] = interrogation['person']
        db.collection('interrogations').update(interrogation, check_rev=False)

    for endpoint in ('to', 'from'):
        current = relation[endpoint]
        if current == interrogation['name'] or current == interrogation['person_mentioned_as']:
            relation[endpoint] = interrogation['person_id']

    # Dump both documents for manual inspection; the long text fields are left out.
    for field, value in relation.items():
        print_rainbow(field, value)
    print()
    for field, value in interrogation.items():
        if field in ('text', 'formatted_text'):
            continue
        print_rainbow(field, value)

    db.collection('relations').update(relation, check_rev=False)

@ -1,95 +1,261 @@
from _chroma import ChromaDB from _chroma import chroma
from _arango import arango from _arango import arango, db
from _llm import LLM from _llm import LLM
from pprint import pprint
from print_color import * from print_color import *
import multiprocessing import multiprocessing
from typing import Union
import difflib
class Person:
    """Base class holding the collected info text and an LLM summary of it."""

    def __init__(self):
        # Newline-joined facts about the person; filled in by subclasses.
        self.info = None
        # Set by make_summary().
        self.summary = None

    def make_summary(self):
        """Summarize ``self.info`` with the LLM and store the result in ``self.summary``.

        Returns nothing; the summary is only available on the attribute.
        A missing ``info`` is treated as an empty text instead of raising
        ``TypeError`` on ``len(None)``.
        """
        llm = LLM(chat=False, small=True)
        info = self.info or ""
        if len(info) > 100:
            # Longer info is described to the model as several pieces of information.
            summary_prompt = (
                "Nedan är olika bitar med information om en person:\n\n"
                f"{info}\n\nSammanfatta dessa på ett detaljerat sätt. Var noga med namn, platser, händelser och relationer.\n"
                "Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. "
            )
        else:
            summary_prompt = (
                "Nedan är information om en person:\n\n"
                f"{info}\n\nSammanfatta denna information så detaljerat som möjligt. Var noga med namn, platser, händelser och relationer.\n"
                "Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. "
            )
        self.summary = llm.generate(summary_prompt)
class UnverifiedPerson(Person):
    """A person whose identity has not been confirmed yet, built from a raw document."""

    def __init__(self, doc: dict, interrogation: str = None):
        """Mirror every key of ``doc`` as an attribute and normalise ``info`` and ``name``.

        Args:
            doc: The person document. An ``info`` value is joined with newlines,
                so it is expected to be a list of strings.
            interrogation: Accepted but not used by this constructor.
        """
        super().__init__()
        self.doc = doc
        for field, value in doc.items():
            setattr(self, field, value)
        # These two always exist afterwards, even when the document lacks them.
        self.info = "\n".join(doc["info"]) if 'info' in doc else None
        self.name = doc["name"] if 'name' in doc else ''
class FoundPerson(Person):
    """
    Represents a person found in ArangoDB.

    Attributes:
        name (str): The name of the person.
        info (str): The document's ``info`` entries joined with newlines.
        key (str): The key used to fetch the person from the ``persons`` collection.
        doc (dict): The person's document in ArangoDB.
        summary (str): A summary of the person's details.
    """

    def __init__(self, db, name, key):
        super().__init__()
        self.name = name
        self.key = key
        self.doc = db.collection("persons").get(key)
        self.info = "\n".join(self.doc["info"])
        # make_summary() stores its result on self.summary and returns nothing,
        # so assigning its return value would reset the summary to None.
        self.make_summary()
def check_name(person, answer_person, text):
    """Decide whether ``person`` plausibly refers to the same person as ``answer_person``.

    Args:
        person: The name as it was mentioned (may be a single first or last name).
        answer_person: The full name of the candidate.
        text: The text in which ``person`` was mentioned.

    Returns:
        bool: True when the names are judged to belong to the same person.
    """
    print_yellow(person, " - ", answer_person)
    person_parts = person.strip().split()
    print('Length person:', len(person_parts))

    if len(person_parts) != 1:
        # Several name parts (or none): compare the complete strings.
        name_similarity = difflib.SequenceMatcher(None, person, answer_person).ratio()
        print("Similarity:", name_similarity)
        return name_similarity > 0.85

    answer_parts = answer_person.split()
    if not answer_parts:
        # Nothing to compare the single name against.
        return False
    answer_first_name = answer_parts[0]
    answer_last_name = answer_parts[-1]

    first_name_similarity = difflib.SequenceMatcher(
        None, person, answer_first_name
    ).ratio()
    last_name_similarity = difflib.SequenceMatcher(
        None, person, answer_last_name
    ).ratio()
    print("First name similarity:", first_name_similarity)
    print("Last name similarity:", last_name_similarity)

    def mentioned_according_to_llm(kind, wanted_name):
        # Ask the small model whether the other half of the name occurs in the text.
        llm = LLM(small=True)
        answer = llm.generate(
            f'Nämns någon med {kind} "{wanted_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
        )
        return "JA" in answer

    if first_name_similarity > 0.9:
        if answer_last_name in text:
            return True
        # Count how many stored persons share this first name.
        first_names = [
            i["name"].split()[0] for i in db.collection("persons").all()
        ]
        first_name_count = first_names.count(answer_first_name)
        print("First name count:", first_name_count)
        if first_name_count == 1:
            return True
        return mentioned_according_to_llm("efternamnet", answer_last_name)

    if last_name_similarity > 0.9:
        if answer_first_name in text:
            return True
        return mentioned_according_to_llm("förnamnet", answer_first_name)

    return False
def find_with_llm(unverified_person: UnverifiedPerson):
    """Make sure the unverified person has a summary; nothing else is done here yet."""
    unverified_person.make_summary()
def find_person(
unverified_person: Union[dict, UnverifiedPerson, str] = None,
name: str = None,
key: str = None,
):
"""
Finds a person in the Chroma db.
Args: Args:
person (dict): A dictionary containing information about the person. unverified_person (Union[dict, UnverifiedPerson]): The unverified person to idetify.
Returns: Returns:
list: A list of tuples containing the following information: FoundPerson: The found person
- generated answer (str): The generated answer from the language model.
- person information (dict): Information about the matched person in the database.
- interrogation document (dict): The document containing the interrogation text.
- mentioned person name (str): The name of the person mentioned in the interrogation.
- matched person name (str): The name of the person matched in the database.
- original person information (dict): The original information about the person.
""" """
db = arango.db
llm = LLM()
other_person = person["name"] if not isinstance(unverified_person, UnverifiedPerson):
if unverified_person is None:
unverified_person = {}
if name:
unverified_person['name'] = name
if key:
unverified_person['_key'] = key
unverified_person = UnverifiedPerson(unverified_person)
chroma = ChromaDB() if "is_not" in unverified_person.doc:
col = chroma.client.get_or_create_collection("mala_persons") list_filter_isnot = [unverified_person.name].append(
if "is_not" not in other_person: unverified_person.doc["is_not"]
filter_isnot = {} )
else: else:
filter_isnot = {"name": {"$nin": other_person["is_not"]}} list_filter_isnot = [unverified_person.name]
filter_isnot = {"name": {"$nin": list_filter_isnot}}
# Do a query to find the person query_results = chroma.query(
hits = col.query(query_texts=[other_person], n_results=1, where=filter_isnot) query_texts=[unverified_person.name],
n_results=1,
where=filter_isnot,
collection="mala_persons",
)
found_person = hits["documents"][0][0] distance = query_results["distances"][0][0]
found_person_key = hits["metadatas"][0][0]["_key"] print_purple(query_results["metadatas"][0][0]["name"], distance)
distance = hits["distances"][0][0]
# * Filter out hits with distance > 1
if distance > 1: if distance > 1:
return [] unverified_person.make_summary()
found_person_in_arango = db.collection("persons").get(found_person_key) query_results = chroma.query(
found_person_info = "\n".join(found_person_in_arango["info"]) query_texts=[unverified_person.summary],
n_results=1,
where=filter_isnot,
collection="mala_persons_info",
)
distance = query_results["distances"][0][0]
print_yellow(query_results["metadatas"][0][0]["name"], distance)
if distance > 1:
return None
# return unverified_person, found_person, False
print_red("NAME", query_results["documents"][0][0])
found_person = FoundPerson(
db,
name=query_results["metadatas"][0][0]["name"],
key=query_results["metadatas"][0][0]["_key"],
)
return found_person
prompt = f"Nedan är olika bitar med information om en person:\n\n{found_person_info}\n\nSammanfatta dessa på ett detaljerat sätt, var noga med namn, platser, händelser och relationer. Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. "
person_in_arango_summary = llm.generate(prompt)
# Write summary about the person
interrogations = person["mentioned_in_interrogation"] def identify(unverified_person: Union[dict, UnverifiedPerson]):
"""
Finds and summarizes a person based on the provided person document.
Args:
person_doc (dict): The person document containing information about the person.
Returns:
dict: A dictionary containing the following keys:
- "unverified_person": An instance of the UnverifiedPerson class representing the unverified person.
- "found_person": An instance of the FoundPerson class representing the found person.
- "suggestions": A list of tuples containing suggestions and interrogation IDs.
"""
llm = LLM(small=True)
output = [] if not isinstance(unverified_person, UnverifiedPerson):
for interrogation in interrogations: unverified_person = UnverifiedPerson(unverified_person)
interrogation_doc = db.collection("interrogations").get(interrogation) found_person = find_person(unverified_person)
text = interrogation_doc["text"]
prompt = f'''I texten nedan omnämns en "{other_person}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n if not found_person:
return {
"unverified_person": unverified_person,
"found_person": None,
"suggestions": [
(None, i) for i in unverified_person.doc["mentioned_in_interrogation"]
],
}
suggestions = []
for interrogation_id in unverified_person.doc["mentioned_in_interrogation"]:
interrogation_data = db.collection("interrogations").get(interrogation_id)
text = interrogation_data["text"]
answer_prompt = f'''I texten nedan omnämns en "{unverified_person.name}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n
TEXT: TEXT:
"""{text}"""\n """{text}"""\n
andra ställen i polisens förundersökning finns en person som heter "{found_person}", och som beskrivs här: andra ställen i polisens förundersökning finns en person som heter "{found_person.name}", och som beskrivs här:
"""{person_in_arango_summary}"""\n """{found_person.summary}"""\n
Verkar det troligt att personen som kallas {other_person} är samma person som {found_person}? Svara bara JA eller NEJ, samt en kort förklaring till varför. Verkar det troligt att personen som kallas {unverified_person.name} är samma person som {found_person.name}? Svara bara JA eller NEJ, samt en kort förklaring till varför.
''' '''
# Om istället förnamnet eller efternamnet är helt olika så är det förmodligen inte samma person.Om det bara är ett namn (inget efternamn) kan det också handla om ett smeknamn eller en beskrivning. answer = llm.generate(answer_prompt)
answer = llm.generate(prompt) suggestions.append((answer, interrogation_data))
output.append(
(
answer,
found_person_in_arango,
interrogation_doc,
other_person,
found_person,
found_person_info,
person,
)
)
return output return {
"unverified_person": unverified_person,
"found_person": found_person,
"suggestions": suggestions,
}
def verify( def verify(
db, db,
answer=None, answer=None,
person=None, unverified_person=None,
person_in_arango=None, found_person=None,
interrogation_key=None, interrogation_key=None,
): ):
""" """
@ -109,40 +275,46 @@ def verify(
print_blue("Answer:", answer) print_blue("Answer:", answer)
# If the answer is Yes # If the answer is Yes
if answer == "Yes": if answer == "Yes":
person["mentioned_in_interrogation"].remove(interrogation_key) unverified_person.doc["mentioned_in_interrogation"].remove(interrogation_key)
person_in_arango["confirmed"] = True db.collection("persons").update(unverified_person.doc)
db.collection("persons").update(person)
person_in_arango["info"] += person["info"]
person_in_arango["mentioned_in_interrogation"] += ["mentioned_in_interrogation"]
from pprint import pprint found_person.doc["confirmed"] = True
found_person.doc["info"] += found_person.doc["info"]
found_person.doc["mentioned_in_interrogation"] += ["mentioned_in_interrogation"]
print("Updated person in arango:") print("Updated person in arango:")
pprint( print_green(
db.collection("persons").insert(person_in_arango, overwrite_mode="update") db.collection("persons").insert(found_person.doc, overwrite_mode="update")
) )
if person["mentioned_in_interrogation"] == [] and person['_key'] != person_in_arango['_key']: if (
db.collection("other_persons").insert(person, overwrite=True) unverified_person.doc["mentioned_in_interrogation"] == []
db.collection("persons").delete(person, check_rev=False) and unverified_person.doc["_key"] != found_person.doc["_key"]
print(f"Removed {person}") ):
db.collection("other_persons").insert(
unverified_person.doc, overwrite_mode="update"
)
db.collection("persons").delete(unverified_person.doc, check_rev=False)
print_red(f"Removed {unverified_person.doc}")
# If the answer is No # If the answer is No
if answer == "No": if answer == "No":
if "is_not" not in person: if "is_not" not in unverified_person.doc:
person["is_not"] = [] unverified_person.doc["is_not"] = []
person["is_not"].append([person_in_arango["name"]]) unverified_person.doc["is_not"].append([found_person.doc["name"]])
db.collection("persons").update(person, merge=True, check_rev=False) db.collection("persons").update(
unverified_person.doc, merge=True, check_rev=False
)
# If the answer is Unknown # If the answer is Unknown
if answer == "Unknown": if answer == "Unknown":
db.collection("unknown").insert( db.collection("unknown").insert(
{"name": person, "interrogation": interrogation_key}, overwrite=True {"name": unverified_person.name, "interrogation": interrogation_key},
overwrite=True,
) )
if __name__ == "__main__": if __name__ == "__main__":
db = arango.db
persons = list(db.collection("persons").all()) persons = list(db.collection("persons").all())
q = "for doc in persons filter doc.other == true return doc" q = "for doc in persons filter doc.other == true return doc"

@ -0,0 +1,34 @@
from _llm import LLM
from _arango import db
from _chroma import chroma
from print_color import *
from identify_person import find_person
llm = LLM(small=True)
def check_from(relations):
    """Ask the LLM whether each relation's ``from`` person is the interrogated person.

    When the answer contains "JA", the interrogated person's id is stored as
    ``from_key`` on the relation and the relation is saved.

    Args:
        relations: Iterable of relation documents with ``interrogation`` and ``from`` keys.
    """
    for relation in relations:
        interrogation = db.collection('interrogations').get(relation['interrogation'])
        if not interrogation:
            # The referenced interrogation could not be fetched: report and skip
            # instead of failing on the subscripts below.
            print_red(relation)
            continue
        text = f"Hörd person: {interrogation['person']}\n{interrogation['text']}"
        # Only the first 2000 characters are sent to the model.
        excerpt = text[:2000]
        prompt = f"""Är "{relation['from']}" personen som förhörs i texten nedan?\n\n{excerpt}\n\nSvara enbart JA eller NEJ."""
        answer = llm.generate(prompt)
        if 'JA' in answer:
            relation['from_key'] = interrogation['person_id']
            db.collection('relations').update(relation, check_rev=False)
        print_rainbow(relation['from'], interrogation['person'], answer)
# Try to identify the 'to' person for relations that have no from_key yet.
q = "for doc in relations filter doc.from_key == null limit 10 return doc" #! Limit 10
relations = list(db.aql.execute(q))

for relation in relations:
    # Newline-separated descriptions of the relation (built but not used further down).
    desc = '\n'.join(r['description'] for r in relation['relations']).strip()
    print_green(relation['to'])
    match = find_person(name=relation['to'])
    print(match)
    print()

@ -0,0 +1,423 @@
from _chroma import chroma
from _arango import arango, db
from _llm import LLM
from print_color import *
import difflib
import re
from langchain_text_splitters import CharacterTextSplitter
# text_splitter = CharacterTextSplitter(
# separator="\n\n",
# chunk_size=8000,
# chunk_overlap=0,
# length_function=len,
# is_separator_regex=False,
# )
class Person:
    """Base class holding the collected info text and an LLM summary of it."""

    def __init__(self):
        # Newline-joined facts about the person; filled in by subclasses.
        self.info = None
        # Set by make_summary().
        self.summary = None

    def make_summary(self):
        """Build ``self.summary`` from ``self.info``, enriched from the interrogation if needed.

        When ``info`` is missing or shorter than 200 characters and the document
        has both ``interrogation_key`` and ``name``, the interrogation text is
        fetched and the LLM extracts what it says about the person. The combined
        info is then summarized. Returns nothing; when there is nothing to
        summarize and nothing to look up, ``self.summary`` is left untouched.
        """
        info = self.info
        # The base class defines no ``doc``; subclasses do.
        doc = getattr(self, "doc", None) or {}
        can_look_up = "interrogation_key" in doc and "name" in doc
        if not info and not can_look_up:
            # Previously this path failed on doc['interrogation_key'].
            return

        llm = LLM(chat=False, system_prompt="Du sammanfattar information om en person utifrån ett polisförhör. Sammanfattningen ska sedan användas för att göra en sökning i en vektordatabas.")

        if can_look_up and (not info or len(info) < 200):
            interrogation = db.collection("interrogations").get(doc['interrogation_key'])
            if interrogation:
                mentioned_as = doc['name']
                name = getattr(self, "name", mentioned_as)
                interrogation_text = interrogation['text']
                if len(interrogation_text) > 20000 and mentioned_as in interrogation_text:
                    # Keep an 8000-character window starting up to 1000 characters
                    # before the first mention of the name.
                    index = interrogation_text.find(mentioned_as)
                    start = 0 if index < 1000 else index - 1000
                    interrogation_text = interrogation_text[start:start + 8000]
                prompt = (
                    "Nedan är ett polisförhör:\n\n"
                    f"{interrogation_text}\n\n"
                    f'Jag är intresserad av en person som omnämns som "{mentioned_as}". Gör en detaljerad sammanfattning av informationen om {name}. Var noga med relationer, namn och platser. Svara ENBART med informationen om personen, ingenting annat. Svara alltid på svenska!'
                )
                extracted = llm.generate(prompt)
                info = self.info + "\n" + extracted if self.info else extracted
                print_rainbow(f'Info about: {name}', info)

        if not info:
            # The lookup produced nothing to summarize.
            return

        # The original literal opened with four quotes, which put a stray '"'
        # at the start of the prompt.
        summary_prompt = (
            "Nedan är olika bitar med information om en person:\n\n"
            f"{info}\n\n"
            "Sammanfatta dessa på ett detaljerat sätt. Var noga med namn, platser, händelser och relationer.\n"
            "Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat."
        )
        self.summary = llm.generate(summary_prompt)
class UnknownPerson(Person):
    """A person that has not been matched to a known person yet, built from a raw document."""

    def __init__(self, doc: dict):
        """Mirror every key of ``doc`` as an attribute and normalise ``info`` and ``name``.

        Args:
            doc: The person document. An ``info`` value is joined with newlines,
                so it is expected to be a list of strings.
        """
        super().__init__()
        self.doc: dict = doc
        for field, value in doc.items():
            setattr(self, field, value)
        # These two always exist afterwards, even when the document lacks them.
        self.info = "\n".join(doc["info"]) if "info" in doc else None
        self.name = doc["name"] if "name" in doc else ""
class FoundPerson(Person):
    """
    Represents a person found in ArangoDB.

    Attributes:
        name (str): The name of the person.
        info (str): The document's ``info`` entries joined with newlines.
        key (str): The key used to fetch the person from the ``persons`` collection.
        doc (dict): The person's document in ArangoDB.
        summary (str): Stays None until make_summary() is called.
    """

    def __init__(self, db, name, key):
        super().__init__()
        self.name, self.key = name, key
        persons = db.collection("persons")
        self.doc = persons.get(key)
        self.info = "\n".join(self.doc["info"])
class PersonIdentifier:
    """Try to work out which known person an unknown (loosely named) person is.

    Attributes:
        doc: The raw document describing the unknown person, if one was given.
        name: Name of the unknown person (the document value wins over the argument).
        key: Key of the unknown person (the document value wins over the argument).
        unknown_person: The ``UnknownPerson`` being identified.
        found_person: The best candidate, set by ``identify``.
        suggestions: List of ``(llm_answer, interrogation)`` tuples, set by ``identify``.
        interrogation_key: Stored as given; not read by the methods of this class.
        text: Text in which the person is mentioned; used in the ``find_with_llm`` prompt.
    """

    def __init__(
        self,
        doc: dict = None,
        name: str = None,
        key: str = None,
        person: "UnknownPerson" = None,
        interrogation_key: str = None,
        text: str = None,
    ):
        self.doc = doc
        # ``doc`` defaults to None, so guard before looking inside it.
        self.name = doc["name"] if doc and "name" in doc else name
        self.key = doc["_key"] if doc and "_key" in doc else key
        self.unknown_person = None
        self.found_person = None
        self.suggestions = None
        self.interrogation_key = interrogation_key
        self.text = text
        self.get_unknown_person(doc, name, key, person)

    def get_unknown_person(self, doc, name, key, person):
        """Set ``self.unknown_person`` from the first usable source.

        Order of preference: an explicit ``person`` object, a ``doc``, the stored
        document for ``key``, and finally a minimal document made of ``name``/``key``.

        Raises:
            AssertionError: If none of the sources is available.
        """
        self.unknown_person = None
        self.found_person = None

        if person:
            self.unknown_person = person
            return
        if doc:
            self.unknown_person = UnknownPerson(doc)
            return

        # Fetch the stored document once rather than once for the check and once for use.
        stored = db.collection("persons").get(key) if key else None
        if stored:
            self.unknown_person = UnknownPerson(stored)
            return

        assert key or name, "Both key and name are missing."
        self.unknown_person = UnknownPerson(
            {k: v for k, v in [("name", name), ("_key", key)] if v}
        )

    def _llm_confirms_name(self, kind, wanted_name, text):
        """Ask the small model whether ``wanted_name`` occurs in the first 5000 characters of ``text``."""
        llm = LLM(small=True)
        answer = llm.generate(
            f'Nämns någon med {kind} "{wanted_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
        )
        return "JA" in answer

    def check_name(self, text):
        """Check if it's likely that the unknown person and the found person are the same.

        Args:
            text: The text in which the unknown person is mentioned.

        Returns:
            bool: True when the names are judged to belong to the same person.
        """
        unknown_name = self.unknown_person.name
        found_name = self.found_person.name
        print_yellow(unknown_name, " - ", found_name)

        if len(unknown_name.strip().split()) != 1:
            # Several name parts (or none): compare the complete strings.
            name_similarity = difflib.SequenceMatcher(
                None, unknown_name, found_name
            ).ratio()
            return name_similarity > 0.85

        found_parts = found_name.split()
        if not found_parts:
            # Nothing to compare the single name against.
            return False
        answer_first_name = found_parts[0]
        answer_last_name = found_parts[-1]

        if difflib.SequenceMatcher(None, unknown_name, answer_first_name).ratio() > 0.9:
            if answer_last_name in text:
                return True
            # Count how many stored persons share this first name.
            first_names = [
                i["name"].split()[0] for i in db.collection("persons").all()
            ]
            if first_names.count(answer_first_name) == 1:
                return True
            return self._llm_confirms_name("efternamnet", answer_last_name, text)

        if difflib.SequenceMatcher(None, unknown_name, answer_last_name).ratio() > 0.9:
            if answer_first_name in text:
                return True
            return self._llm_confirms_name("förnamnet", answer_first_name, text)

        return False

    def find_with_llm(self):
        """Search the interrogation vector collection and ask the LLM for the person's full name.

        Returns:
            The LLM's answer, or None when the summary yielded no search sentences.
        """
        if not self.unknown_person.summary:
            self.unknown_person.make_summary()
        llm = LLM(chat=True, system_prompt="Du hjälper till att ta reda på vad en person heter. Först skapar du meningar som ska användas för att söka i en vektordatabas, sedan använder du informationen du får där till att ta reda på vad personen heter. Svara alltid på svenska.")
        print_rainbow('Info bites:', self.unknown_person.summary)
        # The original prompt contained the invalid escape "\D"; a blank line was intended.
        info_bites = llm.generate(f"Nedan är olika bitar med information om en person:\n\n {self.unknown_person.summary} \n\nDela upp den i 3-4 meningar där varje mening beskriver en specifik detalj om personen. Svara med en mening per rad. Svara ENBART med informationen om personen, ingenting annat.")
        # One query per non-empty line; blank lines would be wasted queries.
        querys = [line.strip() for line in info_bites.split("\n") if line.strip()]
        print_rainbow('Querys:', querys)
        if not querys:
            return None

        chroma_docs = chroma.query(
            query_texts=querys,
            n_results=3,
            collection="mala_interrogations",
        )
        info = ''
        for answer in chroma_docs['documents']:
            for doc in answer:
                print_blue(doc)
                info += doc + "\n"

        unknown_name = self.unknown_person.name
        prompt = (
            f'Nedan är en text där {self.name} nämns:\n\n{self.text}\n\nJag vill veta vem "{unknown_name}" är. Läs texten nedan för att se om du kan hitta personens fulla namn:\n\n'
            f'{info}\n\n'
            f'Vad heter "{unknown_name}"? Svara med förnamn och efternamn på formen "Förnamn Efternamn". Svara "None" om det inte går att säga utifrån informationen.'
        )
        print_yellow('Längd på info:', len(info))
        print_rainbow('Prompt', prompt)
        answer = llm.generate(prompt)
        print_green(answer)
        return answer

    def find_person(self):
        """Finds a person in the Chroma db.

        Searches by name first and, when the closest hit has a distance above 1,
        by a summary of the person.

        Returns:
            FoundPerson for the closest hit, or None when both searches are too far off.
        """
        # list.append() returns None, so the exclusion list has to be built explicitly.
        excluded_names = [self.unknown_person.name]
        for entry in self.unknown_person.doc.get("is_not", []):
            # Entries may be plain names or one-element lists; accept both.
            if isinstance(entry, (list, tuple)):
                excluded_names.extend(entry)
            else:
                excluded_names.append(entry)
        filter_isnot = {"name": {"$nin": excluded_names}}

        query_results = chroma.query(
            query_texts=[self.unknown_person.name],
            n_results=1,
            where=filter_isnot,
            collection="mala_persons",
        )
        distance = query_results["distances"][0][0]
        print_purple(query_results["metadatas"][0][0]["name"], distance)

        if distance > 1:  #! This is not really working...
            self.unknown_person.make_summary()
            query_results = chroma.query(
                query_texts=[self.unknown_person.summary],
                n_results=1,
                where=filter_isnot,
                collection="mala_persons_info",
            )
            distance = query_results["distances"][0][0]
            print_yellow(query_results["metadatas"][0][0]["name"], distance)
            if distance > 1:
                return None

        print_blue("Name found peson:", query_results["documents"][0][0])
        best_hit = query_results["metadatas"][0][0]
        return FoundPerson(db, name=best_hit["name"], key=best_hit["_key"])

    def identify(self):
        """Find a candidate and ask the LLM, per interrogation, whether it is the same person.

        Sets ``self.found_person`` and ``self.suggestions``. Without a candidate,
        the suggestions are ``(None, interrogation_id)`` placeholders.
        """
        # find_person() takes no arguments besides self.
        self.found_person = self.find_person()
        mentioned_in = self.unknown_person.doc.get("mentioned_in_interrogation", [])

        if not self.found_person:
            self.suggestions = [(None, i) for i in mentioned_in]
            # Stop here: there is no candidate to summarize or compare against.
            return

        llm = LLM(small=True)
        # Summarize the found persons info
        self.found_person.make_summary()
        unknown_name = self.unknown_person.name
        found_name = self.found_person.name
        found_summary = self.found_person.summary

        suggestions = []
        for interrogation_id in mentioned_in:
            interrogation_data = db.collection("interrogations").get(interrogation_id)
            text = interrogation_data["text"]
            answer_prompt = (
                f'I texten nedan omnämns en "{unknown_name}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n\n'
                'TEXT:\n'
                f'"""{text}"""\n\n'
                f'På andra ställen i polisens förundersökning finns en person som heter "{found_name}", och som beskrivs här:\n'
                f'"""{found_summary}"""\n\n'
                f'Verkar det troligt att personen som kallas {unknown_name} är samma person som {found_name}? Svara bara JA eller NEJ, samt en kort förklaring till varför.\n'
            )
            answer = llm.generate(answer_prompt)
            suggestions.append((answer, interrogation_data))

        self.suggestions = suggestions
def verify(
    db,
    answer=None,
    unknown_person=None,
    found_person=None,
    interrogation_key=None,
):
    """
    Store the reviewer's verdict on whether an unknown person is the found person.

    Args:
        db: The database object.
        answer (str): The verdict. Can be "Yes", "No", or "Unknown".
        unknown_person: The person being identified; its ``doc`` and ``name`` are used.
        found_person: The candidate person; its ``doc`` is used.
        interrogation_key (str): The key identifying the interrogation.

    Returns:
        None
    """
    print_blue("Answer:", answer)

    # If the answer is Yes
    if answer == "Yes":
        unknown_doc = unknown_person.doc
        found_doc = found_person.doc

        # This mention is settled for the unknown person.
        mentions = unknown_doc["mentioned_in_interrogation"]
        if interrogation_key in mentions:
            mentions.remove(interrogation_key)
        db.collection("persons").update(unknown_doc)

        found_doc["confirmed"] = True
        # Move the unknown person's facts over to the confirmed person. The
        # previous code appended the found person's own info to itself.
        found_info = found_doc.setdefault("info", [])
        for item in unknown_doc.get("info", []):
            if item not in found_info:
                found_info.append(item)
        # Record the interrogation key itself, not the literal field name.
        found_mentions = found_doc.setdefault("mentioned_in_interrogation", [])
        if interrogation_key is not None and interrogation_key not in found_mentions:
            found_mentions.append(interrogation_key)

        print("Updated person in arango:")
        print_green(
            db.collection("persons").insert(found_doc, overwrite_mode="update")
        )

        # With no mentions left, the unknown person is moved out of ``persons``
        # (unless it is the very same document as the found person).
        if not mentions and unknown_doc["_key"] != found_doc["_key"]:
            db.collection("other_persons").insert(
                unknown_doc, overwrite_mode="update"
            )
            db.collection("persons").delete(unknown_doc, check_rev=False)
            print_red(f"Removed {unknown_doc}")

    # If the answer is No
    if answer == "No":
        # Keep ``is_not`` a flat list of names so it can be used as a ``$nin`` filter.
        rejected = unknown_person.doc.setdefault("is_not", [])
        rejected_name = found_person.doc["name"]
        if rejected_name not in rejected:
            rejected.append(rejected_name)
        db.collection("persons").update(
            unknown_person.doc, merge=True, check_rev=False
        )

    # If the answer is Unknown
    if answer == "Unknown":
        db.collection("unknown").insert(
            {"name": unknown_person.name, "interrogation": interrogation_key},
            overwrite=True,
        )
class PersonFinder:
    """Extract person names from interrogation text with an LLM.

    Attributes:
        names: Mapping of already known names; ``extract_names`` adds aliases to it.
        llm: The model used for extraction.
        text_splitter: Splitter configured with the given chunking parameters.
    """

    def __init__(
        self,
        names=None,
        chunk_size=5000,
        chunk_overlap=0,
        separator="\n\n",
    ):
        # A fresh dict per instance; a ``{}`` default would be shared between instances.
        self.names = {} if names is None else names
        self.llm = LLM(
            chat=False,
            small=True,
            system_prompt="Du är en assistent som hjälper till att hitta personer i ett polisförhör. Svara bara när personen finns i den del du får, hitta inte på personer.",
        )
        self.text_splitter = CharacterTextSplitter(
            # Honour the caller's separator (it used to be hard-coded to "\n\n").
            separator=separator,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )

    def extract_names(self, chunk, extra_prompt=""):
        """Return the names the LLM finds in ``chunk`` that are not known yet.

        A name that is a substring of a known name is treated as an alias: it is
        added to ``self.names`` with the known name's value and is not returned.

        Args:
            chunk: The text to search.
            extra_prompt: Extra instructions appended to the prompt.

        Returns:
            list[str]: New names in the order the model listed them, without duplicates.
        """
        # Find persons in the text
        prompt = (
            'Jag vill hitta alla personer som nämns i texten nedan:\n\n'
            f'"""{chunk}"""\n\n'
            'Vilka personer nämns i texten? Svara ENBART med en pythonformaterad lista av namn.\n'
            'Exempel på svar för att du ska förstå formen:\n'
            '<exempel>\n'
            '[namn1, namn2, namn3].\n'
            '</exempel>\n'
            'Var noga med att svara\n'
            f'{extra_prompt}'
        )
        response = self.llm.generate(prompt)
        # Keep only letters, hyphens, spaces and commas so the answer can be split on commas.
        response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")

        chunk_names = []
        for candidate in response.split(","):
            if len(candidate) <= 2:
                continue
            name = candidate.strip()
            if name in chunk_names or name in self.names:
                continue
            is_alias = False
            # Iterate over a snapshot: aliases are added to self.names inside the loop.
            for known in list(self.names):
                if name in known:
                    is_alias = True
                    self.names[name] = self.names[known]
            if not is_alias:
                chunk_names.append(name)
        return chunk_names
if __name__ == "__main__":
    # Manual smoke test against the live database.
    # NOTE(review): this document is fetched but not used below; confirm
    # whether it was meant to be passed to PersonIdentifier as text.
    text = db.collection('rumors').get('Mikael_Sjostrom_2023-02-13_p.98')
    probe_doc = {'name': 'Douglas', 'interrogation_key': "_'Larsson',_'_Neo'__2023-02-15_p.208"}
    person = PersonIdentifier(doc=probe_doc)
    person.find_with_llm()

@ -1,11 +0,0 @@
from _arango import arango
from _llm import LLM
llm = LLM(keep_alive=6000, chat=False)
q = 'for doc in interrogations filter doc.reason != null return doc'
docs = [i for i in arango.db.aql.execute(q)]
for doc in docs:
print("\033[92m", doc['person'], "\033[0m", doc['reason'])

@ -1,23 +1,58 @@
from random import choice
def print_green(*args):
    """Print all arguments in green, each followed by a single space."""
    text = "".join(str(arg) + " " for arg in args)
    print(f"\033[92m{text}\033[0m")
def print_red(*args):
    """Print all arguments in red, each followed by a single space."""
    text = "".join(str(arg) + " " for arg in args)
    print(f"\033[91m{text}\033[0m")
def print_yellow(*args):
    """Print all arguments in yellow, each followed by a single space."""
    text = "".join(str(arg) + " " for arg in args)
    print(f"\033[93m{text}\033[0m")
def print_blue(*args):
    """Print all arguments in blue, each followed by a single space."""
    text = "".join(str(arg) + " " for arg in args)
    print(f"\033[94m{text}\033[0m")
def print_purple(*args):
    """Print all arguments in purple, each followed by a single space."""
    text = "".join(str(arg) + " " for arg in args)
    print(f"\033[95m{text}\033[0m")
def choose_color(last_color_index):
    """Return the colour that follows ``last_color_index`` in a fixed five-colour cycle.

    Args:
        last_color_index: Index of the previously used colour (-1 to start at the first).

    Returns:
        tuple: ``(ansi_code, colour_name, colour_index)`` of the next colour.
    """
    palette = [
        ("blue", "\033[94m"),
        ("green", "\033[92m"),
        ("yellow", "\033[93m"),
        ("red", "\033[91m"),
        ("purple", "\033[95m"),
    ]
    # Wrap around to the first colour after the last one.
    next_index = (last_color_index + 1) % len(palette)
    colour_name, ansi_code = palette[next_index]
    return ansi_code, colour_name, next_index
def print_rainbow(*args):
    """Print the arguments on one line, each in the next colour of the cycle."""
    pieces = []
    index = -1
    for arg in args:
        # Only the code and the new index are needed; the colour name is ignored.
        code, _name, index = choose_color(index)
        pieces.append(f"{code}{arg}\033[0m ")
    print("".join(pieces))

@ -0,0 +1,15 @@
from _llm import LLM
from _arango import arango
from print_color import *
# Ask the LLM, for every interrogation, what is said about the Saturday evening.
llm = LLM(chat=False)

interrogations = list(arango.db.collection("interrogations").all())
for interrogation in interrogations:
    text = interrogation['text']
    # NOTE(review): the prompt contains the misspelling "prsonre"; it is kept
    # as is here because it is part of what the model receives.
    prompt = f'Vad sägs om lördagskvällen i texten nedan? \n\n"""{text}""" Jag vill veta vad som sägs i texten om lördagskvällen. Var noga med prsonre, namn och platser.'
    answer = llm.generate(prompt)
    # Name of the interrogated person first, then the answer and a blank line.
    print_blue(interrogation['person'])
    print(answer, '\n')

@ -0,0 +1,94 @@
from _chroma import ChromaDB
from _openai import LLM_OpenAI as LLM
import streamlit as st
from print_color import *
def get_docs(user_input):
    """Query the 'mala_interrogations' collection for ``user_input``.

    Uses the module-level ``chroma`` client, asks for five results and returns
    the query result unchanged.
    """
    return chroma.query('mala_interrogations', user_input, n_results=5)
def generate_prompt(user_input, docs):
    """Build the LLM prompt that answers ``user_input`` from retrieved excerpts.

    Args:
        user_input: The user's question; it is placed both first and last in
            the prompt.
        docs: Query result whose ``docs['documents'][0]`` holds the excerpt
            texts and ``docs['metadatas'][0]`` the matching metadata dicts,
            each with a ``'person'`` and a ``'date'`` key.

    Returns:
        The prompt string. This function only builds the prompt; it does not
        contact the LLM. An earlier version also sent the prompt to the LLM
        and threw the answer away, which duplicated the request the caller
        makes with the returned prompt.

    Raises:
        KeyError: If a metadata dict lacks ``'person'`` or ``'date'``.
    """
    # One section per excerpt: an upper-cased heading naming the interrogated
    # person and the date, followed by the excerpt text itself.
    sections = [
        f'\n\nFrån förhör med {meta["person"]} {meta["date"]}:'.upper()
        + f'\n{text}\n\n'
        for text, meta in zip(docs['documents'][0], docs['metadatas'][0])
    ]
    context = ''.join(sections)

    return (
        f'Svara på frågan: {user_input}\n\n'
        'Använd endast informationen nedan:\n\n'
        f'{context}\n\n'
        'Skriv utförligt svenska och var noga med detaljer som namn, plats och datum.\n'
        'Får gärna med information från alla fem förhör om det är relevant.\n\n'
        f'{user_input}'
    )
# Streamlit chat page: the user asks about the interrogations, relevant
# excerpts are retrieved with get_docs and the LLM answers from them.
st.set_page_config(
    page_title="Malå",
)

# Should not be reset every run: the answering LLM lives in the session state.
if "llm" not in st.session_state:
    st.session_state.llm = LLM(chat=True, system_prompt='Du är assistent åt en journalist som går igenom förhör i en förundersökning. Svara bara utifrån den information du får. Svara alltid på svenska!')

# Should be reset every run: a fresh helper LLM that classifies and rewrites
# the question.
llm_checker = LLM(chat=True)
chroma = ChromaDB()

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Accept user input
if user_input := st.chat_input("Fråga något om förhören."):
    print_blue(user_input)

    # Default: search with the question exactly as the user typed it. It is
    # only replaced when the checker clearly labels it a follow-up, so the
    # name is always bound whatever the checker answers.
    question2chroma = user_input

    if len(st.session_state.messages) > 1:
        # One "role: content" line per earlier message (the stray quote that
        # used to be appended after every line has been removed).
        history = ''.join(
            f"{message['role']}: {message['content']}\n"
            for message in st.session_state.messages
        )
        prompt = f'En användare har ställt frågan "{user_input}" och här är chatthistoriken mellan användaren och en assistent:\n{history}\n\nVerkar "{user_input}" vara en uppföljningfråga eller en fristående fråga? Svara ENDAST med "uppföljning" eller "fristående".'
        chat_completion = llm_checker.generate(prompt, stream=False)
        answer = chat_completion.choices[0].message.content
        print_red(answer)

        # Compare case-insensitively and tolerate a missing answer; an answer
        # mentioning "fristående" keeps the original question, as before.
        verdict = (answer or '').lower()
        if 'uppföljning' in verdict and 'fristående' not in verdict:
            prompt = f'Använd historiken till att omformulera "{user_input}" till en helt fristående fråga. Frågan ska användas för att hitta information i förhören.'
            chat_completion = llm_checker.generate(prompt, stream=False)
            rewritten = chat_completion.choices[0].message.content
            if rewritten:
                question2chroma = rewritten
        print_yellow(question2chroma)

    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": user_input})

    # Display user message in chat message container
    with st.chat_message("user"):
        st.markdown(user_input)

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        docs = get_docs(question2chroma)
        prompt = generate_prompt(user_input, docs)
        stream = st.session_state.llm.generate(prompt)
        response = st.write_stream(stream)

    # Record the answer both in the LLM's own message list and in the
    # history that is rendered on the next rerun.
    st.session_state.llm.messages.append({'role': 'assistant', 'content': response})
    st.session_state.messages.append({"role": "assistant", "content": response})
    print()
Loading…
Cancel
Save