Refactor print_color functions to accept multiple arguments

main
lasseedfast 2 years ago
parent b5ad23f652
commit 744b1f02f2
  1. 7
      Malå.py
  2. 335
      Verifiera_namn_.py
  3. 47
      _arango.py
  4. 95
      _chroma.py
  5. 88
      _llm.py
  6. 76
      _openai.py
  7. 91
      app.py
  8. 43
      app_test.py
  9. 72
      arango_admin.py
  10. 150
      extract_fup.py
  11. 252
      extract_persons.py
  12. 87
      extract_relations.py
  13. 25
      extract_rumors.py
  14. 26
      fix_relations.py
  15. 328
      identify_person.py
  16. 34
      identify_persons_in_relations.py
  17. 423
      person_identifier.py
  18. 11
      persons.py
  19. 53
      print_color.py
  20. 15
      saturday.py
  21. 94
      testopenai_chat.py

@ -3,4 +3,9 @@ import streamlit as st
st.set_page_config(
page_title="Malå",
)
)
st.markdown('##### Välj något av alternativen till vänster.')
st.markdown('**Fråga om förhör** är en chatt där du kan fråga om förhör.')
st.markdown('**Personer** är en lista över personer i utredningen.')
st.markdown('**Verifiera namn** är en sida där du kan verifiera overifierade namn.')

@ -0,0 +1,335 @@
import streamlit as st
from identify_person import identify, verify, find_person, UnverifiedPerson, FoundPerson
from _arango import arango
import re
from fuzzywuzzy import process
from _llm import LLM as LLM_garda
from _openai import LLM_OpenAI as LLM
from print_color import *
from random import randint
# from print_color import *
print("Start")
def reset_choices():
    """Clear the three per-form selections kept in the Streamlit session state."""
    for choice_key in ("user_choice", "unconfirmed_choice", "custom_choice"):
        st.session_state[choice_key] = None
def check_if_dict_in_list(target_dict, list_of_dicts):
    """Tell whether any dict in ``list_of_dicts`` starts with the same pair as ``target_dict``.

    Only the FIRST key/value pair of each dict is compared; any further pairs
    are ignored.

    Args:
        target_dict: Dict whose first item is the pair to look for.
        list_of_dicts: Iterable of dicts to search.

    Returns:
        bool: True if a dict with a matching first pair is found. False
        otherwise, including when ``target_dict`` is empty. Empty dicts in
        ``list_of_dicts`` are skipped instead of raising ``IndexError``.
    """
    if not target_dict:
        return False
    target_key, target_value = next(iter(target_dict.items()))
    for candidate in list_of_dicts:
        if not candidate:
            # An empty dict has no first pair to compare against.
            continue
        key, value = next(iter(candidate.items()))
        if key == target_key and value == target_value:
            return True
    return False
def submitted():
    """Form-submit callback: record in the session state that the form was sent."""
    st.session_state["next"] = True
@st.cache_data()
def sort_names_by_similarity(target_name, name_list):
    """Return the names in ``name_list`` ordered from most to least similar to ``target_name``.

    Scores come from ``fuzzywuzzy.process.extract``, called with a limit equal
    to the list length so that no name is dropped.
    """
    ranked = sorted(
        process.extract(target_name, name_list, limit=len(name_list)),
        key=lambda scored: scored[1],
        reverse=True,
    )
    # Keep only the name from each (name, score) tuple.
    return [name for name, _score in ranked]
@st.cache_data()
def get_persons():
    """Return every document of the ``persons`` collection as a list (cached by Streamlit)."""
    persons_collection = arango.db.collection("persons")
    return list(persons_collection.all())
@st.cache_data()
def get_unverified_persons():
    """Return all ``persons`` documents whose ``confirmed`` field is not true (cached by Streamlit)."""
    cursor = db.aql.execute(
        "for doc in persons filter doc.confirmed != true return doc"
    )
    return list(cursor)
def get_suggestions(person):
    """Run ``identify`` for ``person`` and publish its result in the session state.

    This function is deliberately NOT decorated with ``st.cache_data``: it
    returns nothing and works only through its writes to ``st.session_state``.
    With caching, a cache hit skips the body, so ``unverified_person``,
    ``found_person`` and ``suggestions`` would not be set for that run and the
    caller's following ``st.session_state.suggestions.pop(0)`` would fail.

    Args:
        person: The value passed straight to ``identify`` (callers in this
            file pass ``UnverifiedPerson.__dict__``).

    Side effects:
        Sets ``st.session_state.unverified_person``, ``.found_person`` and
        ``.suggestions`` from the matching keys of the ``identify`` result.
    """
    suggestion = identify(person)
    st.session_state.unverified_person = suggestion["unverified_person"]
    st.session_state.found_person = suggestion["found_person"]
    st.session_state.suggestions = suggestion["suggestions"]
def caps(string):
    """Format a name for display.

    Names containing ``*`` get every ``*`` removed and keep their casing;
    names without ``*`` are returned upper-cased.
    """
    return string.replace("*", "") if "*" in string else string.upper()
def get_unverified_person():
    """Pop a random document from the unverified queue and make it the current person.

    The popped document is wrapped in ``UnverifiedPerson`` and stored in
    ``st.session_state.unverified_person``. When the queue is empty,
    ``randint`` raises ``ValueError``; a notice is then shown and the script
    run is stopped.
    """
    try:
        remaining = st.session_state.unverified_persons
        picked = remaining.pop(randint(0, len(remaining) - 1))
        st.session_state.unverified_person = UnverifiedPerson(picked)
    except ValueError:
        st.markdown(":green[Inga fler personer att verifiera.]")
        st.stop()
# ---------------------------------------------------------------------------
# Page setup and session-state initialisation.
# Each "if ... not in st.session_state" guard seeds a value that is not yet
# stored in the session state.
# ---------------------------------------------------------------------------
st.set_page_config(
    page_title="Malå",
)
# Get URL parameters
params = st.query_params
# Optional ?person_key=... restricts the page to a single person document.
param_person_key = params.get("person_key", None)
db = arango.db
# Add a session state to store the persons and unconfirmed persons etc
if "next" not in st.session_state:
    st.session_state.next = False
if "persons" not in st.session_state:
    st.session_state.persons = get_persons()
    all_persons_name_list = []
    for person in st.session_state.persons:
        name = person["name"]
        # Unconfirmed persons are marked with a trailing "*"; `caps` strips the
        # marker again when the name is rendered in the select box.
        if not person["confirmed"]:
            name += "*"
        all_persons_name_list.append(name)
    st.session_state.persons_names = all_persons_name_list
    # Lookup from person name to ArangoDB document key.
    st.session_state.persons_dict = {
        i["name"]: i["_key"] for i in st.session_state.persons
    }
if "unverified_persons" not in st.session_state:
    if param_person_key:
        # If a person key is provided in the URL, only show that person
        st.session_state.unverified_persons = list(
            db.aql.execute(
                "for doc in persons filter doc._key == @key return doc",
                bind_vars={"key": param_person_key},
            )
        )
        print_blue("param_person_key".upper(), st.session_state.unverified_persons)
    else:
        st.session_state.unverified_persons = get_unverified_persons()
# NOTE(review): "persons_names" is already assigned in the "persons" branch
# above, so this fallback looks unreachable unless "persons" was set elsewhere
# - confirm before relying on it.
if "persons_names" not in st.session_state:
    st.session_state.persons_names = arango.get_persons(confirmed=False)["names"]
if "user_choice" not in st.session_state:
    st.session_state.user_choice = None
if "unconfirmed_choice" not in st.session_state:
    st.session_state.unconfirmed_choice = None
if "custom_choice" not in st.session_state:
    st.session_state.custom_choice = None
# NOTE(review): this also fires when a current person exists but the queue
# has just been emptied (e.g. after popping the only ?person_key match);
# get_unverified_person() then stops the run before the submit handling
# below is reached - verify this is intended.
if (
    "unverified_person" not in st.session_state
    or not st.session_state.unverified_persons
):
    get_unverified_person()
if "found_person" not in st.session_state:
    st.session_state.found_person = None
if "suggestions" not in st.session_state:
    # Set new values for unverified_person, found_person and suggestions (as session_state)
    get_suggestions(st.session_state.unverified_person.__dict__)
    print_yellow("SUGGESTIONS", st.session_state.suggestions)
if "suggestion" not in st.session_state:
    # Take the first pending suggestion as the one to show on this run.
    st.session_state.suggestion = st.session_state.suggestions.pop(0)
# ---------------------------------------------------------------------------
# Render the current suggestion: the question, the model's answer (if any)
# and the interrogation text with the unverified name highlighted.
# ---------------------------------------------------------------------------
# Get unconfirmed person, found person and answer from the suggestions
unverified_person: UnverifiedPerson = st.session_state.unverified_person
found_person: FoundPerson = st.session_state.found_person
# A suggestion is indexed as [0] = answer text, [1] = interrogation (doc or key).
answer = st.session_state.suggestion[0]
interrogation_doc = st.session_state.suggestion[1]
if isinstance(interrogation_doc, str):
    # Only a key was stored; load the full interrogation document.
    interrogation_doc = db.collection("interrogations").get(interrogation_doc)
text = interrogation_doc["text"]
st.markdown(
    f'Namnet **"{st.session_state.unverified_person.name}"** används i **{len(st.session_state.unverified_person.mentioned_in_interrogation)}** förhör. Namnet kan syfta på olika personer i olika sammanhang så vi går igenom förhören ett och ett.'
)
if answer:
    answer = answer.replace("\n", " ")
    st.markdown(
        f"Är :blue[{unverified_person.name}] samma som :blue[{found_person.name}]?"
    )
    print(found_person.__dict__)
    st.write(f'(från förhör med {interrogation_doc["person"]})')
    # Pre-select the radio option matching the answer: "JA" -> index 0 ("Ja"),
    # "NEJ" -> index 1 ("Nej"), otherwise no pre-selection.
    if "JA" in answer:
        st.markdown(f"🤖\n:green[{answer.replace('JA ', '')}]")
        radio_index = 0
    elif "NEJ" in answer:
        radio_index = 1
        st.markdown(f"🤖\n:red[{answer.replace('NEJ ', '')}]")
    else:
        radio_index = None
        st.markdown(f"🤖\n{answer}")
    # Let the user expand for more info
else:
    # No answer available: ask an open question instead.
    st.markdown(f"Vem är :blue[{unverified_person.name}]?")
    st.write(f'(från förhör med {interrogation_doc["person"]})')
# Show the information about the suggested person
# Edit and show the interrogation text
with st.expander(f"Mer information om förhöret"):
    # Put the speaker tags (FL / DH) on their own bold-prefixed lines.
    text = text.replace("\nFL:", "<br>**FL:** ").replace("\nDH:", "<br>**DH:** ")
    # Drop every newline that is not followed by another newline ...
    text = re.sub(r"\n(?!\n)", "", text)
    # ... and collapse any remaining run of newlines into a single one.
    text = re.sub(r"\n\n+", "\n", text)
    # Convert the remaining newlines to HTML breaks and highlight the name.
    text = text.replace("\n", "<br>").replace(
        unverified_person.name, f"**:red[{unverified_person.name}]**"
    )
    # NOTE(review): the interrogation text is rendered with
    # unsafe_allow_html=True - presumably trusted database content; confirm.
    st.markdown(f"##### Förhöret:\n{text}", unsafe_allow_html=True)
# ---------------------------------------------------------------------------
# Selection form. Each widget's value is copied into the session state so the
# submit handling that follows can read it.
# ---------------------------------------------------------------------------
# A form to let the user select an alternative
with st.form("select_alternative"):
    if answer:
        # Let the user decide if the suggested person is the same as the unconfirmed person
        st.session_state.user_choice = st.radio(
            "Select alternative",
            ("Ja", "Nej", "Vet ej"),
            key="user_choice_radio",
            index=radio_index,
        )
    else:
        st.session_state.user_choice = None
    # Let the user select an alternative person
    alternatives = sort_names_by_similarity(
        unverified_person.name, st.session_state.persons_names
    )
    # `caps` renders names without a "*" marker in upper case and strips the
    # marker from the others, which is what the help text refers to.
    st.session_state.unconfirmed_choice = st.selectbox(
        "Välj någon som stämmer",
        alternatives,
        placeholder="Sök en annan",
        index=None,
        key="multiselect",
        format_func=caps,
        help="Personer i caps är bekräftade personer, välj någon av dem om det verkar stämma.",
    )
    # Let the user enter a custom alternative
    st.session_state.custom_choice = st.text_input(
        "Annan person",
        key="custom",
        help="Skriv in namnet på personen om det inte finns i listan. Var noga med stavningen.",
    )
    # Normalise an empty text field to None so it is falsy downstream.
    st.session_state.custom_choice = (
        None if st.session_state.custom_choice == "" else st.session_state.custom_choice
    )
    # If the user has made a selection
    # The `submitted` callback sets `next` to True; this assignment then stores
    # the button's own return value for the current run.
    st.session_state.next = st.form_submit_button("Nästa", on_click=submitted)
# ---------------------------------------------------------------------------
# Handle a submitted form. Precedence: a custom name, then a person picked in
# the select box, then the Ja / Nej / Vet ej radio choice.
# ---------------------------------------------------------------------------
if st.session_state.next:
    if st.session_state.custom_choice:
        # The user typed a new name: summarise what the interrogation says
        # about the person and insert a new person document.
        print("CUSTOM CHOICE", st.session_state.custom_choice)
        llm = LLM()
        info = llm.generate(
            f'Nedan är ett polisförhör där en person omnämns som "{unverified_person.name}".\n\n{interrogation_doc["text"]}\n\nSammanfatta informationen om {unverified_person.name} på ett detaljerat sätt, var noga med namn, platser, händelser och relationer. Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. '
        )
        person_in_arango = db.collection("persons").insert(
            {
                "_key": arango.fix_key_name(st.session_state.custom_choice),
                "name": st.session_state.custom_choice,
                "info": [info],
                "mentioned_in_interrogation": [interrogation_doc["_key"]],
                "mentioned_as": [{unverified_person.name: interrogation_doc["_key"]}],
            }
        )
        # NOTE(review): this verifies against `found_person.doc`, not the
        # document inserted just above (`person_in_arango` is unused), and
        # `found_person` may be None when there was no suggestion - confirm.
        verify(
            db,
            "Yes",
            unverified_person.doc,
            found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )
    elif st.session_state.unconfirmed_choice:
        # The user picked an existing person from the select box.
        unconfirmed_choice = st.session_state.unconfirmed_choice.replace(
            "*", ""
        ).strip()
        print_yellow("OTHER CHOICE", unconfirmed_choice)
        doc = db.collection("persons").get(
            st.session_state.persons_dict[unconfirmed_choice]
        )
        found_person = FoundPerson(
            db, unconfirmed_choice, st.session_state.persons_dict[unconfirmed_choice]
        )
        print("NEW:", found_person.name)
        # NOTE(review): passes the wrapper objects, while the other branches
        # pass `.doc` - check against the signature of `verify`.
        verify(db, "Yes", unverified_person, found_person, interrogation_doc["_key"])
    elif st.session_state.user_choice == "Ja":
        print("USER CHOICE", st.session_state.user_choice)
        # Fix: this branch used a bare `person`, which is only bound by the
        # name-list loop on a first run and is therefore undefined on the rerun
        # that follows a submit (NameError). Use the current unverified person,
        # as the custom-choice branch does.
        mention = {unverified_person.name: interrogation_doc["_key"]}
        if "mentioned_as" not in found_person.doc:
            found_person.doc["mentioned_as"] = []
        if not check_if_dict_in_list(mention, found_person.doc["mentioned_as"]):
            found_person.doc["mentioned_as"].append(mention)
        # NOTE(review): keyword names differ from the "Nej" / "Vet ej" calls
        # (`person` / `person_in_arango` vs `unverified_person` /
        # `found_person`) - confirm against the signature of `verify`.
        verify(
            db,
            answer="Yes",
            person=unverified_person.doc,
            person_in_arango=found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )
    elif st.session_state.user_choice == "Nej":
        verify(
            db,
            "No",
            unverified_person=unverified_person.doc,
            found_person=found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )
    elif st.session_state.user_choice == "Vet ej":
        verify(
            db,
            "Unknown",
            unverified_person=unverified_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )
    reset_choices()
    if not param_person_key:
        # Move on to the next suggestion, or to a new person once the
        # suggestions for the current person are used up.
        if st.session_state.suggestions != []:
            st.session_state.suggestion = st.session_state.suggestions.pop(0)
        else:
            get_unverified_person()
            get_suggestions(st.session_state.unverified_person.__dict__)
            st.session_state.suggestion = st.session_state.suggestions.pop(0)
        st.rerun()
    else:
        # Single-person mode (?person_key=...): nothing more to verify.
        st.markdown(":green[Tack!] Du kan stäna de här fliken nu.")
        st.stop()

@ -43,35 +43,34 @@ class ArangoDB:
return string
def get_persons(self, confirmed=True):
"""
Gets a list of all names in the database.
"""
Retrieves a list of persons from the database.
Args:
confirmed (bool, optional): If True, only returns names of confirmed persons.
If False, returns names of all persons.
Defaults to True.
Args:
confirmed (bool, optional): If True, only retrieves confirmed persons. Defaults to True.
Returns:
dict: A dictionary containing two lists:
- 'names': A list of all names in the database.
- 'dict_persons': A list of dictionaries, where each dictionary contains the name and key of a person.
"""
confirmed_string = ''
if confirmed:
confirmed_string = 'filter person.confirmed == true'
Returns:
dict: A dictionary containing two keys:
- 'names': A list of person names.
- 'dict': A dictionary mapping person names to their corresponding keys.
"""
confirmed_string = ''
if confirmed:
confirmed_string = 'filter person.confirmed == true'
query = f"""
FOR person IN persons
{confirmed_string}
RETURN {{'name': person.name, '_key': person._key}}
"""
persons = [i for i in self.db.aql.execute(query)]
names = [document['name'] for document in persons]
dict_persons = {document['name']: document['_key'] for document in persons}
return {'names': names, 'dict':dict_persons}
query = f"""
FOR person IN persons
{confirmed_string}
RETURN {{'name': person.name, '_key': person._key}}
"""
persons = [i for i in self.db.aql.execute(query)]
names = [document['name'] for document in persons]
dict_persons = {document['name']: document['_key'] for document in persons}
return {'names': names, 'dict':dict_persons}
arango = ArangoDB()
db = arango.db
if __name__ == '__main__':
arango = ArangoDB()
print(arango.db)
print(len(arango.get_persons(confirmed=False)['names']))

@ -4,7 +4,8 @@ from chromadb.config import Settings
from chromadb.api.client import Client
from chromadb.api.models.Collection import Collection
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
class ChromaDB:
"""
@ -24,17 +25,21 @@ class ChromaDB:
host=host,
port=port,
)
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
api_key='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',
model_name="KBLab/sentence-bert-swedish-cased"
)
self.embedding_function: embedding_functions = huggingface_ef
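# NOTE: the API token that used to be hard-coded here has been redacted. Load it
# from the environment instead and revoke the token that was committed.
# huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
# api_key=os.environ["HUGGINGFACE_API_KEY"],
# model_name="KBLab/sentence-bert-swedish-cased",
# )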
self.embedding_function: embedding_functions = (
embedding_functions.SentenceTransformerEmbeddingFunction(
model_name="KBLab/sentence-bert-swedish-cased"
)
)
def print_collections(self):
"""
Prints all collections in the database.
"""
collections = self.client.list_collections()
collections: Collection = self.client.list_collections()
for collection in collections:
print(collection.name)
@ -49,7 +54,9 @@ class ChromaDB:
None
"""
collection = self.client.get_collection("mala_persons")
collection = self.client.get_or_create_collection(
"mala_persons", embedding_function=self.embedding_function
)
# Lists to store the documents, metadatas and ids
documents = []
@ -57,7 +64,11 @@ class ChromaDB:
ids = []
documents.append(person["name"])
metadata = {"name": person["name"], "_key": person["_key"], 'info': "\n".join(person["info"])}
metadata = {
"name": person["name"],
"_key": person["_key"],
"info": "\n".join(person["info"]),
}
metadatas.append(metadata)
ids.append(person["_key"])
@ -78,8 +89,10 @@ class ChromaDB:
"""
from _arango import arango
self.client.delete_collection('mala_persons')
col = self.client.get_or_create_collection('mala_persons')
self.client.delete_collection("mala_persons")
col = self.client.get_or_create_collection(
"mala_persons", embedding_function=self.embedding_function
)
db = arango.db
q = "for doc in persons filter doc.confirmed == true return doc"
@ -88,18 +101,60 @@ class ChromaDB:
for person in persons:
self.add_person_to_chroma(person)
print('Persons in chroma:', col.count())
print("Persons in chroma:", col.count())
def add_all_person_info(self):
    """
    Rebuild the "mala_persons_info" collection from all persons in ArangoDB.

    The collection is deleted (if that succeeds) and recreated with this
    instance's embedding function. One document is then added per person:
    the name followed by the person's "info" entries, one per line, with
    name and _key as metadata and _key as the id.
    """
    from _arango import arango

    try:
        self.client.delete_collection("mala_persons_info")
    except Exception:
        # Best effort: a failed delete (presumably because the collection does
        # not exist yet) must not abort the rebuild. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit propagate.
        pass
    col = self.client.get_or_create_collection(
        "mala_persons_info", embedding_function=self.embedding_function
    )
    # Materialise the result up front before doing the per-person adds.
    persons = list(arango.db.collection("persons").all())
    for person in persons:
        doc = person["name"] + "\n" + "\n".join(person["info"])
        col.add(
            documents=[doc],
            metadatas=[{"name": person["name"], "_key": person["_key"]}],
            ids=[person["_key"]],
        )
def query(self, collection, query_texts, n_results=5, where=None):
    """
    Query a Chroma collection with this instance's embedding function.

    Args:
        collection (str): Name of the collection to query.
        query_texts (str | list[str]): One query string or a list of them;
            a single string is wrapped in a list.
        n_results (int, optional): Number of results per query. Defaults to 5.
        where (dict, optional): Metadata filter. None or an empty dict means
            no filter. Defaults to None (previously a shared mutable ``{}``).

    Returns:
        The value returned by the collection's ``query`` method.
    """
    if isinstance(query_texts, str):
        query_texts = [query_texts]
    col = self.client.get_collection(
        collection, embedding_function=self.embedding_function
    )
    return col.query(
        query_texts=query_texts,
        n_results=n_results,
        # Pass None rather than an empty dict when there is no filter.
        where=where or None,
    )
def add_interrogations():
from _arango import db
from langchain_text_splitters import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
separator="\n\n",
chunk_size=1000,
chunk_overlap=100,
length_function=len,
is_separator_regex=False,
)
interrogatons = list(db.collection('interrogations').all())
for interrogation in interrogatons:
chunks = text_splitter.split_text(interrogation['text'])
for chunk in chunks:
# Initialize the ChromaDB object
chroma = ChromaDB()
if __name__ == '__main__':
chroma = ChromaDB()
chroma.add_all_persons_to_chroma()
if __name__ == "__main__":
chroma.print_collections()
#chroma.add_all_persons_to_chroma()
#chroma.add_all_person_info()

@ -3,11 +3,11 @@ import requests
import concurrent.futures
import queue
import threading
from pprint import pprint
import re
from dotenv import load_dotenv
import os
import json
from print_color import *
load_dotenv()
@ -16,11 +16,13 @@ class LLM:
def __init__(
self,
chat: bool = False,
model: str = "llama3:8b-instruct-q5_K_M",
model: str = os.getenv("LLM_MODEL"),
keep_alive: int = 3600 * 24,
start: bool = False,
system_prompt: str = 'Svara alltid på svenska. Svara bara på det som efterfrågas. Om du inte kan svara, skriv "Jag vet inte".',
temperature: str = 0,
stream=False,
small=False
):
"""
Initializes an instance of MyClass.
@ -34,26 +36,31 @@ class LLM:
which processes requests concurrently. Defaults to False.
"""
self.model = model
self.server = os.getenv("LLM_URL")
self.port = os.getenv("LLM_PORT")
self.model = model
if small:
self.model = os.getenv("LLM_SMALL_MODEL")
self.server = os.getenv("LLM_SMALL_URL")
self.port = os.getenv("LLM_SMALL_PORT")
self.temperature = temperature
self.system_message = {"role": "system", "content": system_prompt}
self.messages = [self.system_message]
self.chat = chat
self.max_tokens = 24000
self.max_length = 24000
self.keep_alive = keep_alive
self.request_queue = queue.Queue()
self.result_queue = queue.Queue()
self.all_requests_added_event = threading.Event()
self.all_results_processed_event = threading.Event()
self.stop_event = threading.Event()
self.stream = stream
if start:
self.start()
def generate(self, message):
def create_data_request(self, message):
# Remove leading and trailing whitespace
message = '\n'.join(line.strip() for line in message.split('\n'))
@ -73,32 +80,46 @@ class LLM:
"messages": messages,
"options": options,
"keep_alive": self.keep_alive,
"stream": False,
"stream": self.stream,
}
return data
def generate_stream(self, message):
# Make a POST request to the API endpoint
result = requests.post(
f"http://{self.server}:{self.port}/api/chat", json=data
).json()
data = self.create_data_request(message)
# print_data = result.copy()
# del print_data["message"]
# del print_data["model"]
response = requests.post(
f"http://{self.server}:{self.port}/api/chat", json=data, stream=True
)
# # Convert durations from nanoseconds to seconds
# for key in ['eval_duration', 'total_duration']:
# if key in print_data:
# duration = print_data[key] / 1e9 # Convert nanoseconds to seconds
# minutes, seconds = divmod(duration, 60) # Convert seconds to minutes and remainder seconds
# print_data[key] = f'{int(minutes)}:{seconds:02.0f}' # Format as minutes:seconds
# Iterate over the response
# Iterate over the response
for line in response.iter_lines():
# Filter out keep-alive new lines
if line:
decoded_line = line.decode('utf-8')
json_line = json.loads(decoded_line) # Parse the line as JSON
yield json_line['message']['content']
# pprint(print_data)
# print('Number of messages', len(messages))
if "message" in result:
answer = result["message"]["content"]
else:
pprint(result)
def generate(self, message):
data = self.create_data_request(message)
# Make a POST request to the API endpoint
result = requests.post(
f"http://{self.server}:{self.port}/api/chat", json=data
)
try:
if 'message' in result.json():
answer = result.json()["message"]["content"]
else:
print_red(result.content)
raise 'Error occurred during API request'
except requests.exceptions.JSONDecodeError:
print_red(result.content)
raise Exception("Error occurred during API request")
if self.chat:
@ -196,15 +217,15 @@ class LLM:
# Add the new message to the list
self.messages.append({"role": "user", "content": message})
# Calculate the total token length of the messages
total_tokens = sum([len((msg["content"])) for msg in self.messages])
# Calculate the total length of the messages
total_length = sum([len((msg["content"])) for msg in self.messages])
# While the total token length exceeds the limit, remove the oldest messages
while total_tokens > self.max_tokens:
# While the total length exceeds the limit, remove the oldest messages
while total_length > self.max_length:
removed_message = self.messages.pop(
1
) # Remove the oldest message (not the system message)
total_tokens -= len((removed_message["content"]))
total_length -= len((removed_message["content"]))
def unload_model(self):
data = {
@ -228,14 +249,13 @@ if __name__ == "__main__":
parser.add_argument("--unload", action="store_true", help="Unload the model")
args = parser.parse_args()
# llm = LLM(model='llama3:70b-text-q4_K_M', keep_alive=6000, chat=True)
llm = LLM(keep_alive=6000, chat=True)
llm = LLM(keep_alive=60000, chat=True, small=False)
if args.unload:
llm.unload_model()
else:
while True:
message = input(">>> ")
message = '''Hej
bad är kul'''
print(llm.generate(message))

@ -0,0 +1,76 @@
from openai import OpenAI, RateLimitError
from dotenv import load_dotenv
import os
from _llm import LLM as LLM_ollama
from print_color import *
from time import sleep
load_dotenv()
class LLM_OpenAI:
    """Wrapper around the OpenAI chat-completions API with a local Ollama fallback.

    In chat mode the conversation history is kept in ``self.messages`` and
    trimmed by character count. When the OpenAI call is rate limited, or when
    ``local=True`` is requested, the prompt is sent to the Ollama-backed
    ``LLM`` instead.
    """

    def __init__(
        self,
        system_prompt='Svara alltid på svenska. Svara bara på det som efterfrågas. Om du inte kan svara, skriv "Jag vet inte".',
        chat=False,
        model="gpt-3.5-turbo-0125",
        max_tokens=24000,
        sleep_time=0,
    ):
        """Create the OpenAI client and the local backup model.

        Args:
            system_prompt: Content of the system message sent with every request.
            chat: If True, keep and send the running conversation history.
            model: OpenAI model name.
            max_tokens: Upper bound for the summed length of the history.
                Despite the name it is compared against character counts.
            sleep_time: Seconds to sleep before every request.
        """
        self.chat = chat
        self.model = model
        # NOTE(review): stored but not forwarded to the API call in generate() -
        # confirm whether the request was meant to use it.
        self.temperature = 0
        self.max_tokens = max_tokens
        self.system_message = {"role": "system", "content": system_prompt}
        self.messages = [self.system_message]
        self.client = OpenAI(
            # This is the default and can be omitted
            api_key=os.getenv("OPEN_AI"),
        )
        self.llm_ollama = LLM_ollama(chat=False, stream=True)  # For backup
        self.sleep_time = sleep_time

    def build_message(self, message):
        """Append a user message and trim the oldest history to fit ``max_tokens``.

        The system message (index 0) and the message just added are always
        kept. Previously an over-long new message was itself removed and the
        loop could then raise ``IndexError`` on ``pop(1)``.
        """
        # Add the new message to the list
        self.messages.append({"role": "user", "content": message})

        # Calculate the total length (in characters) of the messages
        total_tokens = sum(len(msg["content"]) for msg in self.messages)

        # While the total length exceeds the limit, remove the oldest messages
        while total_tokens > self.max_tokens and len(self.messages) > 2:
            # Remove the oldest message (not the system message)
            removed_message = self.messages.pop(1)
            total_tokens -= len(removed_message["content"])

    def generate(self, prompt, stream=False, local=False):
        """Send ``prompt`` to the model and return the answer.

        Args:
            prompt: The user prompt.
            stream: If True, return the raw response iterator instead of text.
            local: If True, skip OpenAI and use the local Ollama model.

        Returns:
            With ``stream=True``: the response object (an OpenAI stream, or the
            generator of text pieces from the Ollama fallback).
            Otherwise: the complete answer as a string.
        """
        sleep(self.sleep_time)
        if self.chat:
            self.build_message(prompt)
            messages = self.messages
        else:
            messages = [self.system_message, {"role": "user", "content": prompt}]
        print(sum([len((msg["content"])) for msg in messages]))

        # Tracks whether `response` is the Ollama generator of text pieces
        # rather than an OpenAI completion object.
        used_fallback = local
        if local:
            response = self.llm_ollama.generate_stream(prompt)
        else:
            try:
                response = self.client.chat.completions.create(
                    messages=messages,
                    model=self.model,
                    stream=stream,
                )
            except RateLimitError as e:
                print_red(e)
                response = self.llm_ollama.generate_stream(prompt)
                used_fallback = True

        if stream:
            return response

        if used_fallback:
            # The fallback yields text pieces and has no `.choices`; join them
            # instead of failing with AttributeError.
            answer = "".join(response)
        else:
            answer = response.choices[0].message.content
        if self.chat:
            self.messages.append({"role": "assistant", "content": answer})
        return answer

@ -0,0 +1,91 @@
import streamlit as st
import fitz
from fitz import Page, Document
from _llm import LLM
import re
from person_identifier import PersonFinder
from print_color import *
def set_name():
    """Button callback: make the next queued name the current one.

    Pops the first entry of ``st.session_state.names`` into
    ``st.session_state.name``. Does nothing when the queue is missing, None or
    empty, so a click after the last name no longer raises inside the callback.
    """
    pending = st.session_state.get("names")
    if pending:
        st.session_state.name = pending.pop(0)
def highlight_name_in_pdf(page: Page, name: str):
    """Highlight every occurrence of ``name`` on ``page`` and save the page as a PNG.

    The image is rendered at 300 dpi and written to the module-level
    ``image_filename``.
    """
    # Locate the name on the page (as quads) and annotate the matches.
    hits = page.search_for(name, quads=True)
    page.add_highlight_annot(hits)

    # Render the annotated page and write it to disk.
    page.get_pixmap(dpi=300).save(image_filename, 'png')
def show_image(filename):
    """Show the image stored at ``filename`` in the Streamlit page."""
    st.image(image=filename)
def get_page(page_number):
    """Return a cropped copy of page ``page_number`` of the module-level ``doc``.

    The page is copied into a fresh single-page document, so the crop box is
    set on the copy.
    """
    single_page_doc = fitz.open()
    single_page_doc.insert_pdf(doc, from_page=page_number, to_page=page_number)
    extracted = single_page_doc[0]
    crop_area = fitz.Rect(0, 100, 520, 800)
    extracted.set_cropbox(crop_area)
    return extracted
@st.cache_resource()
def get_extractor():
    """Create a ``PersonFinder`` (cached as a Streamlit resource)."""
    finder = PersonFinder()
    return finder
# ---------------------------------------------------------------------------
# Page script: extract the names on one PDF page and step through them,
# showing the page image with the current name highlighted.
# ---------------------------------------------------------------------------
st.set_page_config(layout="wide")
filename = "Förhörsprotokoll.pdf"
image_filename = "highlighted.png"  # written by highlight_name_in_pdf
page_number = 89  # hard-coded page to inspect
if 'doc' not in st.session_state:
    # Open the PDF once and keep it in the session state.
    st.session_state.doc = fitz.open(filename)
doc = st.session_state.doc
page = get_page(page_number)
if 'all_names' not in st.session_state:
    st.session_state.all_names = {}
if 'names' not in st.session_state or st.session_state.names is None:
    # (Re)build the queue of names to step through for this page.
    person_extractor = PersonFinder(st.session_state.all_names)
    # NOTE(review): the extractor is passed to its own method as the first
    # argument - confirm against PersonFinder.extract_names.
    st.session_state.names = person_extractor.extract_names(person_extractor, page.get_text())
    st.session_state.all_names = person_extractor.names
    print_blue(st.session_state.names)
    print_purple(st.session_state.all_names)
names = st.session_state.names
if 'name' not in st.session_state:
    st.session_state.name = names.pop(0)
name = st.session_state.name
st.markdown(f'#### {name}')
highlight_name_in_pdf(page, name)
col1, col2 = st.columns([5,2])
with col1:
    show_image(image_filename)
with col2:
    # `set_name` is the on_click callback that advances to the next name.
    # Note that `next` shadows the builtin of the same name.
    next = st.button("Next", on_click=set_name)
if next:
    if len(names) == 0:
        # Queue exhausted: reset so the names are extracted again next run.
        st.session_state.names = None
    else:
        highlight_name_in_pdf(page, st.session_state.name)

@ -0,0 +1,43 @@
from _arango import db
from _llm import LLM
from langchain_text_splitters import CharacterTextSplitter
from print_color import *
# ---------------------------------------------------------------------------
# Script: for every interrogation that has no `formatted_text` yet, split the
# text into chunks, ask the LLM to return each chunk as markdown, and store the
# joined result back on the document.
# ---------------------------------------------------------------------------
interrogations = list(db.aql.execute('for doc in interrogations filter doc.formatted_text == null return doc', count=True))
# Split on blank lines into chunks of at most ~2000 characters, no overlap.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=2000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)
for interrogation in interrogations:
    text = interrogation['text']
    chunks = text_splitter.split_text(text)
    formated_chunks = []
    for chunk in chunks:
        print_yellow(len(chunk))
        # A fresh non-chat LLM is created for every chunk.
        llm = LLM(chat=False, system_prompt='Du formaterar text enligt med markdown för att göra den lättare att läsa. Använd inte rubriker, bara fet och stil. Om det förekommer en dialog fetmarkera den som talar, exempelvis ** DH: **. Namn ska göras fetade, även om det bara är ett förnamn. Svara alltid med EXAKT samma text som du fick, men formaterad. Svara alltid på svenska.')
        # Fall back to the `person` field when the document has no `name`.
        if 'name' not in interrogation:
            interrogation['name'] = interrogation['person']
        name = interrogation['name']
        prompt = f'''Kolla på texten nedan: \n\n\n{chunk}\n\n\nJag vill att du svarar med EXAKT samma text, men formaterad enligt markdown för att vara enklare att läsa. Formatera enligt följande:
- Använd aldrig rubriker (#)
- Om det är en längre dialog mellan förhörsledare (FL) och den hörde (DH) formatera dem med fetstil, exempelvis **DH: **.
- Gör namn personer fetade, även om det bara är ett förnamn. Den förhörde {name} ska inte vara fetad utan normal text.
Ibland är styckeindelningen inte korrekt, försök att göra det lättare att läsa.
Svara ENBART med den formaterade texten, ingenting annat.'''
        formatted_chunk = llm.generate(prompt)
        print_blue(formatted_chunk)
        formated_chunks.append(formatted_chunk)
    # Chunks are joined with a newline followed by a single space.
    formatted_text = '\n '.join(formated_chunks)
    interrogation['formatted_text'] = formatted_text
    # check_rev=False: update without checking the document revision.
    db.collection('interrogations').update(interrogation, check_rev=False)

@ -1,4 +1,8 @@
from _arango import arango
from _chroma import ChromaDB
from langchain_text_splitters import CharacterTextSplitter
from print_color import *
from _llm import LLM
def truncate():
arango.db.collection("other_persons").truncate()
@ -8,55 +12,63 @@ def truncate():
def clear_info_persons():
persons = list(arango.db.collection("persons").all())
for person in persons:
if 'other' in person:
if person['other']:
arango.db.collection('persons').delete(person)
continue
person['info'] = []
person['mentioned_in_interrogation'] = []
arango.db.collection('persons').update(person, merge=False)
if not person['confirmed']:
arango.db.collection("persons").delete(person)
continue
person["info"] = []
person["mentioned_in_interrogation"] = []
person["mentioned_as"] = {}
arango.db.collection("persons").update(person, merge=False)
def clear_changer_interrogations():
interrogations = list(arango.db.collection("interrogations").all())
for interrogation in interrogations:
interrogation['mentioned_persons'] = []
arango.db.collection('interrogations').update(interrogation, merge=False)
interrogation["mentioned_persons"] = []
arango.db.collection("interrogations").update(interrogation, merge=False)
def clean_mentioned_as():
persons = list(arango.db.collection("persons").all())
for person in persons:
if 'mentioned_as' in person:
if "mentioned_as" in person:
mentioned_as = []
for i in person['mentioned_as']:
for i in person["mentioned_as"]:
if i not in mentioned_as:
mentioned_as.append(i)
person['mentioned_as'] = mentioned_as
arango.db.collection('persons').update(person, merge=False)
db = arango.db
person["mentioned_as"] = mentioned_as
person['info'] = []
arango.db.collection("persons").update(person, merge=False)
cursor = db.aql.execute('for doc in rumors return doc')
rumors = list(cursor)
for rumor in rumors:
rumor['class'] = rumor['class'].replace('.', '').strip().lower()
db.collection('rumors').update(rumor, merge=False)
text_splitter = CharacterTextSplitter(
separator="\n\n",
chunk_size=1000,
chunk_overlap=100,
length_function=len,
is_separator_regex=False,
)
db = arango.db
# persons = list(arango.db.collection("persons").all())
# for person in persons:
# for interrogation in person['interrogations']:
interrogations = list(db.aql.execute('for doc in interrogations filter doc.person_mentioned_as == null return doc'))
pms = list(db.aql.execute('for doc in pms return {"_id": doc._id, "page": doc.page}'))
interrogations = interrogations + pms
# arango.db.collection('interrogations').update({'_key': interrogation, 'person_id': person['_id']}, )
# print(f"Updated {interrogation} with person_id {person['_id']}")
interrogations.sort(key=lambda x: x['page'])
# interrogations = list(arango.db.collection("interrogations").all())
for i in interrogations:
llm = LLM(chat=False)
if 'text' not in i:
continue
text = i['text'][:1000]
print_purple(text)
name = i['name']
prompt = f'''Nedan är ett förhör med {name}: \n\n\n{text}\n\n\nOm du ser till själva förhöret, vilket namn används för {name}? Om personen exempelvis bara skrivs ut med förnamn så vara med det. Svara ENBART med namnet, inget annat.'''
answer = llm.generate(prompt)
i['person_mentioned_as'] = answer
db.collection('interrogations').update(i, check_rev=False)
# for interrogation in interrogations:
# interrogation['person_id'] = 'persons/' + interrogation['person'].replace('persons_', '')
# arango.db.collection('interrogations').update(interrogation, merge=False)

@ -3,6 +3,8 @@ import fitz
from _arango import arango
from openai import OpenAI
from pprint import pprint
from print_color import *
class Section:
def __init__(self, type, page, filename="Förhörsprotokoll.pdf"):
@ -15,14 +17,17 @@ class Section:
self.date = ""
self.start_page = page
self.key = ""
self.llm = LLM(chat=True)
def add_to_arango(self):
key = arango.fix_key_name(
f"{self.person}_{self.date}_p.{self.start_page}"
)
# prompt = f'Sammanfatta texten nedan: \n\n """{self.text[:7000]}""" \n\nSammanfattning:'
# llm = LLM(chat=False)
# self.summary = llm.generate(prompt)
# print_green(self.summary)
arango_doc = {
"_key": key,
"_key": self.key,
"person": self.person,
"role": self.role,
"topic": self.topic,
@ -30,44 +35,64 @@ class Section:
"page": self.start_page,
"text": self.text,
"filename": self.filename,
#"summary": self.summary,
"name": self.name,
}
arango.db.collection(self.type).insert(arango_doc, overwrite=True)
print(f"Added {self.type} to ArangoDB with key {key}")
arango.db.collection(self.type).insert(arango_doc, overwrite_mode="update")
print(f"Added {self.type} to ArangoDB with key {self.key}")
def extract_interrogation(self, text):
self.person = llm.generate(
f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är förhörd? Svara på formen "Förnamn Efternamn" \n\nFörhörd person:'
self.person = self.llm.generate(
f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är förhörd? Namnet står ofta på formen "Efternamn, Förnamn". Jag vill att su ska svara på formen "Förnamn Efternamn"'
)
self.role = llm.generate(
f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är {self.person}? \n\nTitel på förhörd person:'
number_of_names = len(self.person.split(' '))
if number_of_names != 2:
first_name = self.llm.generate(
f'Personens formella namn är alltså {self.person}. Om du kollar på själva föhörstexten, är personens förnamn (det som personen kallas för)? Svara bara med ett förnamn.'
)
last_name = self.llm.generate(
f'Och i efternamn?'
)
self.name = f'{first_name} {last_name}'
print_rainbow(self.person, '->', self.name)
else:
self.name = self.person
self.role = self.llm.generate(
f'Vem är {self.person}? Svara så kort som möjligt med titel eller beskrivning.'
)
self.topic = llm.generate(
f'Kolla på texten nedan: \n\n """{text}""" \n\n Vad handlade förhöret om? Svara så kortfattat som möjligt. \n\nFörhörets syfte:'
self.topic = self.llm.generate(
f'Vad handlade förhöret om? Svara så kortfattat som möjligt.'
)
self.date = llm.generate(
f'Kolla på texten nedan: \n\n """{text}""" \n\n När ägde förhöret rum? Svara på formen YYY-MM-DD \n\nFörhörsdatum:'
self.date = self.llm.generate(
f'När ägde förhöret rum? Svara på formen YYY-MM-DD'
)
self.key = arango.fix_key_name(f"{self.person}_{self.date}_p.{self.start_page}")
cursor = arango.db.aql.execute(f'for doc in interrogations filter doc.page == {self.start_page} return doc._key', count=True)
if cursor.count() == 1:
self.key = cursor.next()
else:
print_red("Could not find key")
self.key = arango.fix_key_name(f"{self.person}_{self.date}_p.{self.start_page}")
def extract_pm(self, text):
self.person = llm.generate(
self.person = self.llm.generate(
f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är uppgiftslämnare? Svara på formen "Förnamn Efternamn" \n\nPM:'
)
self.role = llm.generate(
self.role = self.llm.generate(
f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är {self.person}? Svara "None" om det inte framgår. \n\nTitel på person:'
)
self.topic = llm.generate(
self.topic = self.llm.generate(
f'Kolla på texten nedan: \n\n """{text}""" \n\n Vad handlade informationen om? Svara så kortfattat som möjligt. Svara "None" om det inte framgår. \n\Svar:'
)
self.date = llm.generate(
self.date = self.llm.generate(
f'Kolla på texten nedan: \n\n """{text}""" \n\n När lämnades informationen? Svara på formen YYY-MM-DD \n\nDatum:'
)
self.key = arango.fix_key_name(f"{self.person}_{self.date}_p.{self.start_page}")
def new_interrogation(page, section):
@ -85,25 +110,7 @@ def new_pm(page, section):
section.extract_interrogation(page.get_text())
return section
# * Llama
llm = LLM(chat=False, model="llama3:8b-instruct-q5_K_M")
# Open the PDF file
filename = "Förhörsprotokoll.pdf"
area = fitz.Rect(0, 40, 520, 800) # To exlude the header
doc = fitz.open(f"pdfs/{filename}")
section = Section("interrogations", 0)
for page in doc.pages(9, len(doc) - 1):
# Get the text from the page
page_text = page.get_text("text")
def is_new_interrogation(page_text, page):
# Check if there is a new interrogation
control_words_interrogation = [
"Förhörsdatum",
@ -118,31 +125,58 @@ for page in doc.pages(9, len(doc) - 1):
if word in page_text:
n_control_words_interrogation += 1
if n_control_words_interrogation >= 2:
section = new_interrogation(page, section)
area = fitz.Rect(0, 400, 520, 800)
else:
# Check if there is a new PM
control_words_pm = [
print_purple('New interrogation', page)
return True
def is_new_pm(page_text, page):
control_words_pm = [
"PM",
"Uppgiften avser",
"Upprättad av",
"Sätt på vilket uppgift lämnats",
"Uppgiftslämnare",
]
n_control_words_pm = 0
for word in control_words_pm:
if word in page_text:
n_control_words_pm += 1
if n_control_words_pm >= 2:
area = fitz.Rect(0, 400, 520, 800)
section = new_pm(page, section)
else:
# It's a "normal" page
area = fitz.Rect(0, 40, 520, 800) # To exlude the header
n_control_words_pm = 0
for word in control_words_pm:
if word in page_text:
n_control_words_pm += 1
if n_control_words_pm >= 2:
print_blue('New PM', page)
return True
# * Llama
# Open the PDF file
filename = "Förhörsprotokoll.pdf"
area = fitz.Rect(0, 40, 520, 800) # To exlude the header
doc = fitz.open(f"/home/lasse/mala/Förhörsprotokoll.pdf")
section = Section("interrogations", 0)
for page in doc.pages(9, len(doc) - 1):
# Get the text from the page
page_text = page.get_text("text")
if is_new_interrogation(page_text, page):
section = new_interrogation(page, section)
area = fitz.Rect(0, 400, 520, 800)
elif is_new_pm(page_text, page):
# Check if there is a new PM
area = fitz.Rect(0, 400, 520, 800)
section = new_pm(page, section)
else:
# It's a "normal" page
area = fitz.Rect(0, 40, 520, 800) # To exlude the header
blocks = page.get_text("blocks", clip=area)
for block in blocks:
section.text += block[4] + "\n\n"
new_interrogation(page, section)

@ -1,27 +1,49 @@
import multiprocessing
from _llm import LLM
from _llm import LLM as LLM_garda
from _openai import LLM_OpenAI as LLM
#from _llm import LLM
from _arango import arango
from langchain_text_splitters import CharacterTextSplitter
import difflib
import re
import random
from time import sleep
import traceback
from pprint import pprint
from print_color import *
class Interrogation:
def __init__(self, _key, text):
self._key = _key
self.text = text
self.mentioned_persons = []
self.chunks = None
def check_name(person, answer_person, text):
print_yellow(person, " - ", answer_person)
same = False
# If full name similarity is below a threshold (e.g., 0.5), compare first names only
# If person only has one name, first or last, compare that to first and last name of answer_person
print('Length person:', len(person.strip().split()))
if len(person.strip().split()) == 1:
llm = LLM()
answer_first_name = answer_person.split()[0].strip()
answer_last_name = answer_person.split()[-1].strip()
first_name_similarity = difflib.SequenceMatcher(
None, person, answer_first_name
).ratio()
last_name_similarity = difflib.SequenceMatcher(
None, person, answer_last_name
).ratio()
print("First name similarity:", first_name_similarity)
print("Last name similarity:", last_name_similarity)
if difflib.SequenceMatcher(None, person, answer_first_name).ratio() > 0.9:
if answer_last_name in text:
same = True
@ -31,11 +53,13 @@ def check_name(person, answer_person, text):
i["name"].split()[0] for i in db.collection("persons").all()
]
first_name_count = first_names.count(answer_first_name)
print("First name count:", first_name_count)
if first_name_count == 1:
same = True
else:
llm = LLM_garda()
answer = llm.generate(
f'Nämns någon med efternamnet "{answer_last_name}" i texten nedan?\n\n"""{text[:6000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
f'Nämns någon med efternamnet "{answer_last_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
)
if "JA" in answer:
same = True
@ -44,8 +68,9 @@ def check_name(person, answer_person, text):
if answer_first_name in text:
same = True
else:
llm = LLM_garda()
answer = llm.generate(
f'Nämns någon med förnamnet "{answer_first_name}" i texten nedan?\n\n"""{text[:6000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
f'Nämns någon med förnamnet "{answer_first_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
)
if "JA" in answer:
same = True
@ -53,18 +78,9 @@ def check_name(person, answer_person, text):
else:
name_similarity = difflib.SequenceMatcher(None, person, answer_person).ratio()
print("Similarity:", name_similarity)
# person_first_name = person.split()[0]
# answer_person_first_name = answer_person.split()[0]
# first_name_similarity = difflib.SequenceMatcher(
# None, person_first_name, answer_person_first_name
# ).ratio()
# person_last_name = person.split()[-1]
# answer_person_last_name = answer_person.split()[-1]
# print("new:", name_similarity)
if name_similarity > 0.9:
if name_similarity > 0.85:
same = True
return same
@ -86,55 +102,50 @@ def execute_query_with_retry(db, query, max_retries=5, delay=2):
# Then, in your extract_persons function:
def extract_persons(interrogation):
known_persons = {
"Douglas": "Douglas Bengtsson",
"Rashid": "Rashid Sheiksaid",
"Emanuel": "Emanuel Johansson",
"Robert": "Robert Bengtsson",
}
sleep(random.uniform(0.05, 0.3))
print("INTERROGATION:", interrogation["_key"])
q = "for doc in persons filter doc.other != true return doc"
result = execute_query_with_retry(db, q)
persons_docs = list(result)
persons = [i["name"].strip() for i in persons_docs]
first_names = {i["name"].split()[0].strip(): i["name"] for i in persons_docs}
persons_dict = {i["name"]: i for i in persons_docs}
text_splitter = CharacterTextSplitter(
separator="\n\n",
chunk_size=4000,
chunk_overlap=0,
length_function=len,
is_separator_regex=False,
)
chunks = text_splitter.split_text(interrogation["text"])
def extract_persons(interrogation, names_interrogation):
llm = LLM(
chat=True,
system_prompt="Du är en assistent som hjälper till att hitta personer i ett polisförhör. Du får en del av texten från förhöret åt gången. Svara bara när personen finns i den del du får, hitta inte på personer.",
system_prompt="Du är en assistent som hjälper till att hitta personer i ett polisförhör. Svara bara när personen finns i den del du får, hitta inte på personer.",
)
names = []
for chunk in chunks:
# Find persons in the text
prompt = f'''Det här är en text från ett polisförhör där {interrogation["person"]} förhörs:\n
"""{chunk}"""\n
Vilka personer nämns i texten som inte förekommit tidigare? Svara ENBART med en pythonformaterad lista av namn.
Exempel svar för att du ska förstå formen: "["namn1", "namn2", "namn3"]".
Jag är inte intresserad av förhörsledaren eller personen som förhörs.'''
response = llm.generate(prompt)
response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")
for name in [i.strip() for i in response.split(",") if len(i) > 2]:
if name not in names:
# Find persons in the text
prompt = f'''Det här är en text från ett polisförhör där {interrogation["person"]} förhörs:\n
"""{chunk}"""\n
Vilka personer nämns i texten? Svara ENBART med en pythonformaterad lista av namn.
Exempel svar för att du ska förstå formen: "["namn1", "namn2", "namn3"]".
Jag är inte intresserad av förhörsledaren eller personen som förhörs.'''
response = llm.generate(prompt)
response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")
for name in [i.strip() for i in response.split(",") if len(i) > 2]:
if name not in names and name not in names_interrogation:
same_name = False
if names_interrogation != []:
for name_interrogation in list(names_interrogation):
if name in name_interrogation:
same_name = True
names_interrogation[name] = names_interrogation[name_interrogation]
person_arango = db.aql.execute('for doc in persons filter doc.name == @name return doc', bind_vars={'name': names_interrogation[name_interrogation]}, count=True)
if person_arango:
person_arango = list(person_arango)[0]
if interrogation["_key"] not in person_arango["mentioned_as"]:
person_arango["mentioned_as"][interrogation["_key"]] = [name]
else:
if name not in person_arango["mentioned_as"][interrogation["_key"]]:
person_arango["mentioned_as"][interrogation["_key"]].append(name)
db.collection("persons").update(person_arango, check_rev=False)
if not same_name:
names.append(name)
else:
print_green('Name already in names_interrogation', name)
return names, names_interrogation
def identify_persons(names, chunk, names_interrogation):
for name in names:
print_blue('New name:', name)
# Compare the person to a list of known persons
prompt = f'''Jag vill veta vem {name} är. Kolla på förhöret nedan och svara om du hittar något om personen där.
"""{chunk}"""\n
@ -151,6 +162,7 @@ def extract_persons(interrogation):
elif name.split().reverse() in persons:
print("Vände och hittade ✌", name.split().reverse())
person = persons_dict[name.split().reverse()]
else:
closest_matches = difflib.get_close_matches(name, persons, n=4, cutoff=0.3)
@ -160,8 +172,8 @@ def extract_persons(interrogation):
persons_string = "\n".join(closest_matches)
prompt = f"""Jag behöver identifiera {name}. Nedan är en lista på personer det kanske skulle kunna vara:\n
{persons_string}\n
Är {name} någon av dessa personer? I texten kan personen stå med bara sitt förnamn eller efternamn, kolla speciellt efter namn i listan där förnamnet eller efternamnet stämmer. Namnet kan också vara felstavat, men inte ett helt annat namn.
Svara BARA med namnet personen ur listan. Är du inte säker svara "None"."""
Är {name} någon av dessa personer? I texten kan personen stå med bara sitt förnamn eller efternamn, kolla speciellt efter namn i listan där förnamnet eller efternamnet stämmer. Namnet i förhöret kan också vara felstavat, exempelvis ett s istället för två eller kan bokstäver ha bytt plats, men inte ett helt annat namn.
Svara BARA med namnet personen ur listan. Är det inte någon av personerna i listan svara "None"."""
answer_person = llm.generate(prompt)
if answer_person in persons and check_name(
@ -169,35 +181,35 @@ def extract_persons(interrogation):
):
person = persons_dict[answer_person]
else:
print_red(f"""Answer "{answer_person}" not in persons""")
if person:
print_green(f'{name} identified: {person["name"]}', "\n")
if name not in names_interrogation:
names_interrogation[name] = person['name']
print_green(f'{name} identified: {person["name"]}', "\n")
if "info" not in person:
person["info"] = []
if info not in person["info"]:
person["info"].append(info)
if "mentioned_in_interrogation" not in person:
person["mentioned_in_interrogation"] = []
if interrogation["_key"] not in person["mentioned_as"]:
person["mentioned_as"][interrogation["_key"]] = [name]
else:
if name not in person["mentioned_as"][interrogation["_key"]]:
person["mentioned_as"][interrogation["_key"]].append(name)
if interrogation["_key"] not in person["mentioned_in_interrogation"]:
person["mentioned_in_interrogation"].append(interrogation["_key"])
if "mentioned_as" not in person:
person["mentioned_as"] = []
if {name: interrogation["_key"]} not in person["mentioned_as"]:
person["mentioned_as"].append({name: interrogation["_key"]})
db.collection("persons").update(person, check_rev=False)
# db.collection("all_relations").insert(
# {
# "_from": interrogation["person_id"],
# "_to": person["_id"],
# "relation": "mentioned_by",
# "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
# "_key": f'{interrogation["_key"]}_{person["_key"]}'
# },
# overwrite_mode="update",
# merge=True,
# )
# If the person was not identified as a confirmed person, add to the unconfirmed persons
else:
if name not in names_interrogation:
names_interrogation[name] = name
print(f"\033[91m{name} not identified\033[0m")
print_yellow(
"\n".join([f"- {i}" for i in persons_string.split("\n")]), "\n"
@ -207,45 +219,41 @@ def extract_persons(interrogation):
_key = arango.fix_key_name(name) #TODO Are there multiple persons with the same name?
# If no confirmed person was identified, create a new person or add to another unconfirmed person
if not db.collection("persons").get(_key):
if db.collection("persons").get(_key):
doc = db.collection("persons").get(_key)
doc = db.collection("persons").get(_key)
if doc:
if interrogation["_key"] not in doc["mentioned_as"]:
doc["mentioned_as"][interrogation["_key"]] = [name]
else:
doc = {
"_key": _key,
"name": name,
"info": [info],
"other": True,
"confirmed": False,
"mentioned_in_interrogation": [interrogation["_key"]],
}
if name not in doc["mentioned_as"][interrogation["_key"]]:
doc["mentioned_as"][interrogation["_key"]].append(name)
else:
doc = db.collection("persons").get(_key)
if interrogation["_key"] not in doc["mentioned_in_interrogation"]:
doc["mentioned_in_interrogation"].append(interrogation["_key"])
if interrogation["_key"] not in doc["mentioned_in_interrogation"]:
doc["mentioned_in_interrogation"].append(interrogation["_key"])
if info not in doc["info"]:
doc["info"].append(info)
else:
doc = {
"_key": _key,
"name": name,
"info": [info],
"confirmed": False,
"mentioned_in_interrogation": [interrogation["_key"]],
"mentioned_as": {interrogation["_key"]: [name]},
}
db.collection("persons").insert(doc, merge=False, overwrite_mode='update')
if person and person['_key'] not in interrogation["mentioned_persons"]:
interrogation["mentioned_persons"].append(person['_key'])
db.collection("interrogations").update(interrogation, check_rev=False)
# db.collection("all_relations").insert(
# {
# "_from": interrogation["person_id"],
# "_to": doc["_id"],
# "relation": "mentioned_by",
# 'other': True,
# "mentions": [{'interrogation': interrogation["_key"], "date": interrogation["date"], "mentioned_as": info}],
# "_key": f'{interrogation["_key"]}_{doc["_key"]}'
# },
# overwrite_mode="update",
# merge=True,
# )
if __name__ == "__main__":
db = arango.db
q = 'for doc in interrogations filter doc._key == "Markus_Lindahl_2023-02-20_p.93" return doc'
q = 'for doc in interrogations return doc'
interrogations = list(db.aql.execute(q))
interrogations.sort(key=lambda x: x["date"])
@ -276,9 +284,41 @@ if __name__ == "__main__":
# rumors = list(db.aql.execute(q))
# interrogations = [interrogation for interrogation in interrogations if interrogation['_key'] not in rumors]
# print('Number of interrogations to process:', len(interrogations))
print(len(interrogations))
for interrogation in interrogations:
extract_persons(interrogation)
names_interrogation = {}
known_persons = {
"Douglas": "Douglas Bengtsson",
"Rashid": "Rashid Sheiksaid",
"Emanuel": "Emanuel Johansson",
"Robert": "Robert Bengtsson",
"Marlene": "Marlene Ahlqvist",
"Jhonny": "Jhonny Backman",
}
sleep(random.uniform(0.05, 0.3))
print("INTERROGATION:", interrogation["_key"])
q = "for doc in persons filter doc.confirmed == true return doc"
result = execute_query_with_retry(db, q)
persons_docs = list(result)
persons = [i["name"].strip() for i in persons_docs]
first_names = {i["name"].split()[0].strip(): i["name"] for i in persons_docs}
persons_dict = {i["name"]: i for i in persons_docs}
text_splitter = CharacterTextSplitter(
separator="\n\n",
chunk_size=8000,
chunk_overlap=0,
length_function=len,
is_separator_regex=False,
)
chunks = text_splitter.split_text(interrogation["text"])
for chunk in chunks:
names = extract_persons(interrogation)
exit()
with multiprocessing.Pool(processes=3) as pool:
pool.map(extract_persons, interrogations)

@ -5,17 +5,100 @@ from pprint import pprint
from pprint import pprint
from langchain_text_splitters import CharacterTextSplitter
import multiprocessing
from print_color import *
def describe_relation(person1, person2, relation, text):
    """
    Ask the LLM to summarise what a text says about one relation.

    Args:
        person1 (str): First person in the relation.
        person2 (str): Second person in the relation.
        relation (str): Short label of the relation between the two persons.
        text (str): The text in which the relation is described.

    Returns:
        The LLM's answer describing the relation.
    """
    question = f'''
I texten nedan beskrivs att {person1} och {person2} har relationen "{relation}". Läs texten och sammanfatta kortfattat vad som beskrivs om relationen mellan {person1} och {person2}:\n\n"""{text}"""\n
Svara ENBART med information om relationen, inga hälsningsfraser eller liknande.
Relationen ska vara kortfattad och stämma med texten. Om det inte går att beskriva relationen svara med "None".
Vad står det om relationen "{relation}" mellan {person1} och {person2}?
'''
    summariser = LLM(
        system_prompt="Du ska hitta relationer i en text. Svara alltid enligt angiven form och alltid på svenska.",
        chat=False,
    )
    description = summariser.generate(question)
    # Echo the relation label together with the generated description.
    print_rainbow(relation, description)
    return description
def find_relations(interrogation):
    """
    Find the relations between persons described in an interrogation.

    The interrogation text is split into chunks, the LLM is asked to list the
    relations in each chunk as "person1;person2;relation" lines, and every
    relation is then described in more detail with describe_relation().

    Args:
        interrogation (dict): The interrogation document. Reads the keys
            "text", "person" and "_key".

    Returns:
        list[dict]: One document per ordered (person1, person2) pair, in the
        order the pairs were first seen. Each document holds the pair, the
        interrogation key, all chunks, and a list of the relations found
        (label, description and the number of the chunk it was found in).
    """
    text_splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=6000,
        chunk_overlap=0,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = text_splitter.split_text(interrogation["text"])

    llm = LLM(chat=False, system_prompt="Du ska hitta relationer i en text. Svara alltid enligt angiven form och alltid på svenska.")

    # (person1, person2) -> relation document. A dict keeps insertion order,
    # so the returned list has the same order as a first-seen linear scan.
    relations_by_pair = {}

    # enumerate() gives the true position of the chunk; chunks.index(chunk)
    # would report the first occurrence when two chunks have identical text.
    for chunk_number, chunk in enumerate(chunks):
        prompt = f"""Nedan är en bit av ett polisförhör med {interrogation['person']}. Jag vill att du hittar alla relationer mellan identifierbara personer som beskrivs i själva förhöret:\n\n{chunk}\n\n
Svara formen "person1;person2;relation\n". Var noga med hur semikolon används för att skilja personerna och relationen, och ny rad efter varje relation (informationen ska sedan användas för en CSV fil). Svara svenska.
Nedan är ett påhittat exempel för att du ska förstå hur du kan svara:
<exempel>
person1;person2;gick grundskolan tillammans, spelade fotboll
person2;person3;gifta sedan 2022
</exempel>
Beskrivningen av relationen ska vara kortfattad och stämma med texten.
Om det inte finns någon relation, svara "None".
Svara ENBART med relationerna, INGENTING annat som en hälsning eller förklaring.
"""
        response = llm.generate(prompt)
        print_blue(response)

        for line in response.split("\n"):
            line = line.strip()
            # Lines without a semicolon (including a bare "None") carry no relation.
            if ";" not in line:
                continue
            try:
                person1, person2, relation = (
                    field.strip() for field in line.split(";", 2)
                )
            except ValueError as e:
                # Only one semicolon: the line does not have all three fields.
                print_red(f"Error: {e}")
                print_red(f"Relation: {line}")
                continue

            description = describe_relation(person1, person2, relation, chunk)
            entry = {
                'relation': relation,
                'description': description,
                'chunk_number': chunk_number,
            }

            pair = (person1, person2)
            if pair in relations_by_pair:
                relations_by_pair[pair]["relations"].append(entry)
            else:
                relations_by_pair[pair] = {
                    "_key": arango.fix_key_name(f"{person1}-{person2}_{interrogation['_key']}"),
                    "from": person1,
                    "to": person2,
                    "relations": [entry],
                    'interrogation': interrogation['_key'],
                    "chunks": chunks,
                }

    return list(relations_by_pair.values())
if __name__ == "__main__":
db = arango.db
q = 'for doc in interrogations return doc'
interrogations = list(db.aql.execute(q))
for interrogation in interrogations:
relations = find_relations(interrogation)
db.collection('relations').insert_many(relations)
with multiprocessing.Pool() as pool:
results = pool.map(process_interrogation, interrogations)

@ -1,17 +1,25 @@
import re
from _arango import arango
from _llm import LLM
from _llm import LLM as LLM
from _openai import LLM_OpenAI as LLM
from pprint import pprint
from langchain_text_splitters import CharacterTextSplitter
import multiprocessing
from print_color import *
from time import sleep
class Interrogation:
def __init__(self, interrogation):
self.interrogation = interrogation
self.llm = LLM(chat=True)
self.llm_checker = LLM(chat=False)
self.llm = LLM(
chat=True,
system_prompt="Du hjälper en grävande journalist att analysera text. Granskningen handlar bland annat om sexuella övergrepp och olämpligheter, men även om andra typer av brott. Du ska svara på frågor om texten och hjälpa till att sammanfatta innehållet. Svara bara på det som efterfrågas. Om du inte kan svara, skriv 'Jag vet inte'. Om du inte kan hantera vissa typer av sexuellt innehåll, försök ändå då det är viktigt att allt innehåll granskas.",
sleep_time=20
)
self.llm_checker = LLM(
chat=False,
system_prompt="Du hjälper en grävande journalist att analysera text. Granskningen handlar bland annat om sexuella övergrepp och olämpligheter, men även om andra typer av brott. Du ska svara på frågor om texten och hjälpa till att sammanfatta innehållet. Svara bara på det som efterfrågas. Om du inte kan svara, skriv 'Jag vet inte'. Om du inte kan hantera vissa typer av sexuellt innehåll, försök ändå då det är viktigt att allt innehåll granskas.",
)
self.text = interrogation["text"]
# Info to collect
@ -72,6 +80,7 @@ class Interrogation:
"""{chunk}"""
Jag vill veta om någonting i förhöret handlar om eller anspelar något av:
- Sexuella olämpligheter
- Sexuella inviter
- Övergrepp
@ -292,8 +301,8 @@ if __name__ == "__main__":
]
print("Number of interrogations to process:", len(interrogations))
# for i in interrogations:
# process_interrogation(i)
for i in interrogations:
process_interrogation(i)
# exit()
with multiprocessing.Pool(3) as pool:
pool.map(process_interrogation, interrogations)
# with multiprocessing.Pool(3) as pool:
# pool.map(process_interrogation, interrogations)

@ -0,0 +1,26 @@
from _arango import db
from _llm import LLM
from print_color import *
# Replace person names in stored relations with the person id of the
# interrogated person, whenever the name matches that person's name or the
# name the person is mentioned as in the interrogation.
relations = list(db.aql.execute('for doc in relations return doc', count=True))
for relation in relations:
    interrogation = db.collection('interrogations').get(relation['interrogation'])
    if not interrogation:
        # The relation points at an interrogation that does not exist.
        print_red(relation)
        continue

    # Back-fill a missing name once, before comparing, instead of per side.
    if 'name' not in interrogation:
        interrogation['name'] = interrogation['person']
        db.collection('interrogations').update(interrogation, check_rev=False)

    # Not every interrogation has these fields, so read them without raising.
    person_id = interrogation.get('person_id')
    aliases = [
        alias
        for alias in (interrogation['name'], interrogation.get('person_mentioned_as'))
        if alias is not None
    ]
    if person_id is not None:
        for side in ('to', 'from'):
            if relation[side] in aliases:
                relation[side] = person_id

    for k, v in relation.items():
        print_rainbow(k, v)
    print()
    for k, v in interrogation.items():
        # The full texts are too long to be useful in the console output.
        if k in ('text', 'formatted_text'):
            continue
        print_rainbow(k, v)
    db.collection('relations').update(relation, check_rev=False)

@ -1,95 +1,261 @@
from _chroma import ChromaDB
from _arango import arango
from _chroma import chroma
from _arango import arango, db
from _llm import LLM
from pprint import pprint
from print_color import *
import multiprocessing
from typing import Union
import difflib
class Person:
    """Base class holding the collected information about a person and an LLM summary of it."""

    def __init__(self):
        # Free-text information about the person; subclasses fill this in.
        self.info = None
        # Summary of `info`, produced by make_summary().
        self.summary = None

    def make_summary(self):
        """
        Summarise `self.info` with the LLM and store the result in `self.summary`.

        When `self.info` is None there is nothing to summarise; the LLM is not
        called and `self.summary` is left unchanged.

        Returns:
            The value of `self.summary` after the call.
        """
        if self.info is None:
            return self.summary

        llm = LLM(chat=False, small=True)
        # Longer info is worded as several pieces of information in the prompt.
        if len(self.info) > 100:
            summary_prompt = f"""Nedan är olika bitar med information om en person:\n
{self.info}\n\nSammanfatta dessa ett detaljerat sätt. Var noga med namn, platser, händelser och relationer.
Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. """
        else:
            summary_prompt = f"""Nedan är information om en person:\n
{self.info}\n\nSammanfatta denna information detaljerat som möjligt. Var noga med namn, platser, händelser och relationer.
Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. """
        self.summary = llm.generate(summary_prompt)
        # Returned as well, so `x.summary = x.make_summary()` keeps the summary.
        return self.summary
class UnverifiedPerson(Person):
    """A person built from a document dict whose identity has not been verified."""

    def __init__(self, doc: dict, interrogation:str=None):
        """
        Args:
            doc (dict): The person document. Every key is mirrored as an
                attribute on the instance.
            interrogation (str): Not used by the constructor.
        """
        super().__init__()
        self.doc = doc
        # Expose every field of the document as an attribute.
        for field, value in self.doc.items():
            setattr(self, field, value)
        # `info` becomes one newline-joined string, or None when absent.
        self.info = "\n".join(doc["info"]) if 'info' in doc else None
        # `name` falls back to an empty string when absent.
        self.name = doc["name"] if 'name' in doc else ''
class FoundPerson(Person):
    """
    Represents a person found in ArangoDB.

    Attributes:
        name (str): The name of the person.
        info (str): The entries of the document's "info" list, joined by newlines.
        key (str): The key of the person's document in the "persons" collection.
        doc: The person's document as returned by the "persons" collection.
        summary: A summary of the person's info, produced by make_summary().
    """

    def __init__(self, db, name, key):
        """
        Args:
            db: Database handle used to fetch the document from the "persons" collection.
            name (str): The name of the person.
            key (str): The key of the person's document.
        """
        super().__init__()
        self.name = name
        self.key = key
        self.doc = db.collection("persons").get(key)
        self.info = "\n".join(self.doc["info"])
        # make_summary() stores its result on self.summary itself; do not
        # assign its return value, which may be None and would discard it.
        self.make_summary()
def check_name(person, answer_person, text):
    """
    Judge whether a name from a text refers to a suggested person.

    A name with more than one word is compared to the suggested name as a
    whole. A single-word name is compared to the first and the last word of
    the suggested name; on a match, the other part of the suggested name must
    be supported by the text (literally or according to the LLM), or, for a
    first-name match, the first name must be unique among all persons.

    Args:
        person (str): The name as it appears in the text.
        answer_person (str): The suggested full name.
        text (str): The text the name was found in.

    Returns:
        bool: True if the names are judged to be the same person.
    """
    print_yellow(person, " - ", answer_person)
    n_name_parts = len(person.strip().split())
    print('Length person:', n_name_parts)

    if n_name_parts != 1:
        # Not a single-word name: compare the names as whole strings.
        name_similarity = difflib.SequenceMatcher(None, person, answer_person).ratio()
        print("Similarity:", name_similarity)
        return name_similarity > 0.85

    answer_parts = answer_person.split()
    if not answer_parts:
        # An empty suggestion has no first or last name to compare with.
        return False
    answer_first_name = answer_parts[0]
    answer_last_name = answer_parts[-1]

    first_name_similarity = difflib.SequenceMatcher(
        None, person, answer_first_name
    ).ratio()
    last_name_similarity = difflib.SequenceMatcher(
        None, person, answer_last_name
    ).ratio()
    print("First name similarity:", first_name_similarity)
    print("Last name similarity:", last_name_similarity)

    if first_name_similarity > 0.9:
        if answer_last_name in text:
            return True
        # Count how many persons share this first name; docs without a
        # usable name are skipped.
        first_names = [
            doc["name"].split()[0]
            for doc in db.collection("persons").all()
            if (doc.get("name") or "").split()
        ]
        first_name_count = first_names.count(answer_first_name)
        print("First name count:", first_name_count)
        if first_name_count == 1:
            return True
        llm = LLM(small=True)
        answer = llm.generate(
            f'Nämns någon med efternamnet "{answer_last_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
        )
        return "JA" in answer

    if last_name_similarity > 0.9:
        if answer_first_name in text:
            return True
        llm = LLM(small=True)
        answer = llm.generate(
            f'Nämns någon med förnamnet "{answer_first_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
        )
        return "JA" in answer

    return False
def find_with_llm(unverified_person: UnverifiedPerson):
    """Generate the summary of the given unverified person; returns nothing."""
    unverified_person.make_summary()
    return None
def find_person(
unverified_person: Union[dict, UnverifiedPerson, str] = None,
name: str = None,
key: str = None,
):
"""
Finds a person in the Chroma db.
Args:
person (dict): A dictionary containing information about the person.
unverified_person (Union[dict, UnverifiedPerson]): The unverified person to idetify.
Returns:
list: A list of tuples containing the following information:
- generated answer (str): The generated answer from the language model.
- person information (dict): Information about the matched person in the database.
- interrogation document (dict): The document containing the interrogation text.
- mentioned person name (str): The name of the person mentioned in the interrogation.
- matched person name (str): The name of the person matched in the database.
- original person information (dict): The original information about the person.
FoundPerson: The found person
"""
db = arango.db
llm = LLM()
other_person = person["name"]
if not isinstance(unverified_person, UnverifiedPerson):
if unverified_person is None:
unverified_person = {}
if name:
unverified_person['name'] = name
if key:
unverified_person['_key'] = key
chroma = ChromaDB()
col = chroma.client.get_or_create_collection("mala_persons")
if "is_not" not in other_person:
filter_isnot = {}
unverified_person = UnverifiedPerson(unverified_person)
if "is_not" in unverified_person.doc:
list_filter_isnot = [unverified_person.name].append(
unverified_person.doc["is_not"]
)
else:
filter_isnot = {"name": {"$nin": other_person["is_not"]}}
list_filter_isnot = [unverified_person.name]
filter_isnot = {"name": {"$nin": list_filter_isnot}}
# Do a query to find the person
hits = col.query(query_texts=[other_person], n_results=1, where=filter_isnot)
query_results = chroma.query(
query_texts=[unverified_person.name],
n_results=1,
where=filter_isnot,
collection="mala_persons",
)
found_person = hits["documents"][0][0]
found_person_key = hits["metadatas"][0][0]["_key"]
distance = hits["distances"][0][0]
distance = query_results["distances"][0][0]
print_purple(query_results["metadatas"][0][0]["name"], distance)
# * Filter out hits with distance > 1
if distance > 1:
return []
found_person_in_arango = db.collection("persons").get(found_person_key)
found_person_info = "\n".join(found_person_in_arango["info"])
unverified_person.make_summary()
query_results = chroma.query(
query_texts=[unverified_person.summary],
n_results=1,
where=filter_isnot,
collection="mala_persons_info",
)
distance = query_results["distances"][0][0]
print_yellow(query_results["metadatas"][0][0]["name"], distance)
if distance > 1:
return None
# return unverified_person, found_person, False
print_red("NAME", query_results["documents"][0][0])
found_person = FoundPerson(
db,
name=query_results["metadatas"][0][0]["name"],
key=query_results["metadatas"][0][0]["_key"],
)
return found_person
prompt = f"Nedan är olika bitar med information om en person:\n\n{found_person_info}\n\nSammanfatta dessa på ett detaljerat sätt, var noga med namn, platser, händelser och relationer. Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. "
person_in_arango_summary = llm.generate(prompt)
# Write summary about the person
def identify(unverified_person: Union[dict, UnverifiedPerson]):
"""
Finds and summarizes a person based on the provided person document.
Args:
person_doc (dict): The person document containing information about the person.
Returns:
dict: A dictionary containing the following keys:
- "unverified_person": An instance of the UnverifiedPerson class representing the unverified person.
- "found_person": An instance of the FoundPerson class representing the found person.
- "suggestions": A list of tuples containing suggestions and interrogation IDs.
"""
llm = LLM(small=True)
interrogations = person["mentioned_in_interrogation"]
if not isinstance(unverified_person, UnverifiedPerson):
unverified_person = UnverifiedPerson(unverified_person)
found_person = find_person(unverified_person)
output = []
for interrogation in interrogations:
interrogation_doc = db.collection("interrogations").get(interrogation)
text = interrogation_doc["text"]
if not found_person:
return {
"unverified_person": unverified_person,
"found_person": None,
"suggestions": [
(None, i) for i in unverified_person.doc["mentioned_in_interrogation"]
],
}
prompt = f'''I texten nedan omnämns en "{other_person}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n
suggestions = []
for interrogation_id in unverified_person.doc["mentioned_in_interrogation"]:
interrogation_data = db.collection("interrogations").get(interrogation_id)
text = interrogation_data["text"]
answer_prompt = f'''I texten nedan omnämns en "{unverified_person.name}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n
TEXT:
"""{text}"""\n
andra ställen i polisens förundersökning finns en person som heter "{found_person}", och som beskrivs här:
"""{person_in_arango_summary}"""\n
Verkar det troligt att personen som kallas {other_person} är samma person som {found_person}? Svara bara JA eller NEJ, samt en kort förklaring till varför.
andra ställen i polisens förundersökning finns en person som heter "{found_person.name}", och som beskrivs här:
"""{found_person.summary}"""\n
Verkar det troligt att personen som kallas {unverified_person.name} är samma person som {found_person.name}? Svara bara JA eller NEJ, samt en kort förklaring till varför.
'''
# Om istället förnamnet eller efternamnet är helt olika så är det förmodligen inte samma person.Om det bara är ett namn (inget efternamn) kan det också handla om ett smeknamn eller en beskrivning.
answer = llm.generate(prompt)
output.append(
(
answer,
found_person_in_arango,
interrogation_doc,
other_person,
found_person,
found_person_info,
person,
)
)
answer = llm.generate(answer_prompt)
suggestions.append((answer, interrogation_data))
return output
return {
"unverified_person": unverified_person,
"found_person": found_person,
"suggestions": suggestions,
}
def verify(
db,
answer=None,
person=None,
person_in_arango=None,
unverified_person=None,
found_person=None,
interrogation_key=None,
):
"""
@ -109,40 +275,46 @@ def verify(
print_blue("Answer:", answer)
# If the answer is Yes
if answer == "Yes":
person["mentioned_in_interrogation"].remove(interrogation_key)
person_in_arango["confirmed"] = True
db.collection("persons").update(person)
person_in_arango["info"] += person["info"]
person_in_arango["mentioned_in_interrogation"] += ["mentioned_in_interrogation"]
unverified_person.doc["mentioned_in_interrogation"].remove(interrogation_key)
db.collection("persons").update(unverified_person.doc)
from pprint import pprint
found_person.doc["confirmed"] = True
found_person.doc["info"] += found_person.doc["info"]
found_person.doc["mentioned_in_interrogation"] += ["mentioned_in_interrogation"]
print("Updated person in arango:")
pprint(
db.collection("persons").insert(person_in_arango, overwrite_mode="update")
print_green(
db.collection("persons").insert(found_person.doc, overwrite_mode="update")
)
if person["mentioned_in_interrogation"] == [] and person['_key'] != person_in_arango['_key']:
db.collection("other_persons").insert(person, overwrite=True)
db.collection("persons").delete(person, check_rev=False)
print(f"Removed {person}")
if (
unverified_person.doc["mentioned_in_interrogation"] == []
and unverified_person.doc["_key"] != found_person.doc["_key"]
):
db.collection("other_persons").insert(
unverified_person.doc, overwrite_mode="update"
)
db.collection("persons").delete(unverified_person.doc, check_rev=False)
print_red(f"Removed {unverified_person.doc}")
# If the answer is No
if answer == "No":
if "is_not" not in person:
person["is_not"] = []
if "is_not" not in unverified_person.doc:
unverified_person.doc["is_not"] = []
person["is_not"].append([person_in_arango["name"]])
db.collection("persons").update(person, merge=True, check_rev=False)
unverified_person.doc["is_not"].append([found_person.doc["name"]])
db.collection("persons").update(
unverified_person.doc, merge=True, check_rev=False
)
# If the answer is Unknown
if answer == "Unknown":
db.collection("unknown").insert(
{"name": person, "interrogation": interrogation_key}, overwrite=True
{"name": unverified_person.name, "interrogation": interrogation_key},
overwrite=True,
)
if __name__ == "__main__":
db = arango.db
persons = list(db.collection("persons").all())
q = "for doc in persons filter doc.other == true return doc"

@ -0,0 +1,34 @@
from _llm import LLM
from _arango import db
from _chroma import chroma
from print_color import *
from identify_person import find_person
llm = LLM(small=True)
def check_from(relations):
    """Ask the LLM whether each relation's 'from' name is the interrogated person.

    For every relation the referenced interrogation is loaded and the first
    2000 characters (header line included) are shown to the LLM. When the
    answer contains "JA", the interrogation's ``person_id`` is stored on the
    relation as ``from_key`` and the relation is written back.

    Args:
        relations: Iterable of relation documents with at least the keys
            ``interrogation`` and ``from``.
    """
    for rel in relations:
        interrogation_doc = db.collection('interrogations').get(rel['interrogation'])
        heard_person = interrogation_doc['person']
        text = f"Hörd person: {heard_person}\n{interrogation_doc['text']}"
        prompt = f"""Är "{rel['from']}" personen som förhörs i texten nedan?\n\n{text[:2000]}\n\nSvara enbart JA eller NEJ."""
        verdict = llm.generate(prompt)

        if 'JA' in verdict:
            rel['from_key'] = interrogation_doc['person_id']
            db.collection('relations').update(rel, check_rev=False)

        # Always show the verdict so the run can be followed in the terminal.
        print_rainbow(rel['from'], heard_person, verdict)
# Fetch a small batch of relations whose 'from' side has not been resolved yet.
q = "for doc in relations filter doc.from_key == null limit 10 return doc" #! Limit 10
relations = list(db.aql.execute(q))

for relation in relations:
    # One description per line, without surrounding whitespace.
    desc = '\n'.join(r['description'] for r in relation['relations']).strip()
    print_green(relation['to'])
    print(find_person(name=relation['to']))
    print()

@ -0,0 +1,423 @@
from _chroma import chroma
from _arango import arango, db
from _llm import LLM
from print_color import *
import difflib
import re
from langchain_text_splitters import CharacterTextSplitter
# text_splitter = CharacterTextSplitter(
# separator="\n\n",
# chunk_size=8000,
# chunk_overlap=0,
# length_function=len,
# is_separator_regex=False,
# )
class Person:
    """Base class for a person whose information can be summarized by the LLM.

    ``make_summary`` reads ``self.doc`` and ``self.name``; this class does not
    set them, so subclasses are expected to assign both after calling
    ``super().__init__()``.
    """

    def __init__(self):
        self.info = None
        self.summary = None

    def make_summary(self):
        """Create ``self.summary`` from the information known about the person.

        When there is no info, or less than 200 characters of it, and the
        document names both an ``interrogation_key`` and a ``name``, the
        interrogation text is loaded and the LLM is asked to extract what it
        says about the person. That extract is appended to any existing info
        before the final summary is generated.

        Raises:
            ValueError: If there is no info and none could be extracted.
        """
        llm = LLM(chat=False, system_prompt="Du sammanfattar information om en person utifrån ett polisförhör. Sammanfattningen ska sedan användas för att göra en sökning i en vektordatabas.")
        doc = getattr(self, "doc", None) or {}
        info = self.info

        # Only go back to the interrogation when the document tells us which
        # interrogation and which name to look for.
        can_enrich = "interrogation_key" in doc and "name" in doc
        if can_enrich and (not info or len(info) < 200):
            interrogation = db.collection("interrogations").get(doc["interrogation_key"])
            if interrogation is not None:
                interrogation_text = interrogation["text"]
                name_in_doc = doc["name"]
                # Very long texts are cut down to an 8000 character window
                # that starts at most 1000 characters before the first mention.
                if len(interrogation_text) > 20000 and name_in_doc in interrogation_text:
                    index = interrogation_text.find(name_in_doc)
                    start = 0 if index < 1000 else index - 1000
                    interrogation_text = interrogation_text[start:start + 8000]
                prompt = f"""Nedan är ett polisförhör:\n
{interrogation_text}\n
Jag är intresserad av en person som omnämns som "{name_in_doc}". Gör en detaljerad sammanfattning av informationen om {self.name}. Var noga med relationer, namn och platser. Svara ENBART med informationen om personen, ingenting annat. Svara alltid på svenska!"""
                info = llm.generate(prompt)
                if self.info:
                    info = self.info + "\n" + info
                print_rainbow(f'Info about: {self.name}', info)

        if not info:
            raise ValueError("No information available to summarize for this person.")

        summary_prompt = f"""Nedan är olika bitar med information om en person:\n
{info}\n
Sammanfatta dessa på ett detaljerat sätt. Var noga med namn, platser, händelser och relationer.
Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat."""
        self.summary = llm.generate(summary_prompt)
class UnknownPerson(Person):
    """A person whose identity has not been confirmed yet.

    Every key of ``doc`` is mirrored as an instance attribute. ``info`` and
    ``name`` are then normalized: ``info`` becomes the document's info entries
    joined by newlines (or ``None``), and ``name`` defaults to an empty string.
    """

    def __init__(self, doc: dict):
        super().__init__()
        self.doc: dict = doc

        # Expose the raw document fields as attributes.
        for field, value in self.doc.items():
            setattr(self, field, value)

        # Normalized values override whatever the raw fields contained.
        self.info = "\n".join(doc["info"]) if "info" in doc else None
        self.name = doc["name"] if "name" in doc else ""
class FoundPerson(Person):
    """
    Represents a person found in ArangoDB.

    Attributes:
        name (str): The name of the person.
        info (str): The document's info entries joined by newlines
            (empty string when the document has none).
        key (str): The ``_key`` of the person's document.
        doc (dict): The person's document from the ``persons`` collection.
        summary (str): A summary of the person's details; ``None`` until
            ``make_summary`` has been called.
    """

    def __init__(self, db, name, key):
        """Load the person's document from the ``persons`` collection.

        Raises:
            KeyError: If no document with ``key`` exists.
        """
        super().__init__()
        self.name = name
        self.key = key
        self.doc = db.collection("persons").get(key)
        if self.doc is None:
            # Fail with a clear message instead of a TypeError on None below.
            raise KeyError(f"No person with key {key!r} in the persons collection")
        self.info = "\n".join(self.doc.get("info") or [])
class PersonIdentifier:
def __init__(
self,
doc: dict = None,
name: str = None,
key: str = None,
person: UnknownPerson = None,
interrogation_key: str=None,
text: str=None
):
self.doc: dict = doc
self.name: str = name
if 'name' in doc:
self.name = doc['name']
self.key: str = key
if '_key' in doc:
self.key = doc['_key']
self.unknown_person: UnknownPerson = None
self.found_person: FoundPerson = None
self.suggestions = None
self.interrogation_key = interrogation_key
self.text = text
self.get_unknown_person(doc, name, key, person)
def get_unknown_person(self, doc, name, key, person):
"""Get the unknown person."""
self.unknown_person = None
self.found_person = None
# Set the unknown person
if person:
self.unknown_person = person
elif doc:
self.unknown_person = UnknownPerson(doc)
elif key and db.collection("persons").get(key):
self.unknown_person = UnknownPerson(db.collection("persons").get(key))
else:
assert key or name, "Both key and name are missing."
self.unknown_person = UnknownPerson(
{k: v for k, v in [("name", name), ("_key", key)] if v}
)
def check_name(self, text):
    """Check if it's likely that the unknown person and the found person are the same.

    A multi-word unknown name is compared to the found person's full name
    (similarity above 0.85). A single-word name is compared to the found
    person's first and last name (similarity above 0.9); on a match the other
    half of the name must be supported by ``text``, by the first name being
    unique among all persons, or by the LLM.

    Args:
        text (str): The text in which the unknown person is mentioned.

    Returns:
        bool: True when the names are judged to belong to the same person.
    """
    unknown_name = self.unknown_person.name
    found_name = self.found_person.name
    print_yellow(unknown_name, " - ", found_name)

    def similarity(a, b):
        return difflib.SequenceMatcher(None, a, b).ratio()

    def llm_sees_name(kind, name):
        # Ask the small model whether the other half of the name occurs in the text.
        answer = LLM(small=True).generate(
            f'Nämns någon med {kind} "{name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
        )
        return "JA" in answer

    # More than one word (or none): compare the complete names.
    if len(unknown_name.strip().split()) != 1:
        return similarity(unknown_name, found_name) > 0.85

    found_parts = found_name.split()
    if not found_parts:
        # Nothing to compare a single name against.
        return False
    answer_first_name = found_parts[0]
    answer_last_name = found_parts[-1]

    if similarity(unknown_name, answer_first_name) > 0.9:
        if answer_last_name in text:
            return True
        # A first name carried by exactly one person is taken as a match.
        first_name_count = sum(
            1
            for p in db.collection("persons").all()
            if (p.get("name") or "").split()[:1] == [answer_first_name]
        )
        if first_name_count == 1:
            return True
        return llm_sees_name("efternamnet", answer_last_name)

    if similarity(unknown_name, answer_last_name) > 0.9:
        if answer_first_name in text:
            return True
        return llm_sees_name("förnamnet", answer_first_name)

    return False
def find_with_llm(self):
    """Try to find the unknown person's full name with the help of the LLM.

    The person's summary is split by the LLM into a few search sentences,
    which are used to query the ``mala_interrogations`` collection. The
    retrieved passages are then given to the LLM together with the question
    of what the person is called.

    Returns:
        The LLM's answer, or ``None`` when no search sentences were produced.
    """
    if not self.unknown_person.summary:
        self.unknown_person.make_summary()
    llm = LLM(chat=True, system_prompt="Du hjälper till att ta reda på vad en person heter. Först skapar du meningar som ska användas för att söka i en vektordatabas, sedan använder du informationen du får där till att ta reda på vad personen heter. Svara alltid på svenska.")
    print_rainbow('Info bites:', self.unknown_person.summary)
    info_bites = llm.generate(f"Nedan är olika bitar med information om en person:\n\n {self.unknown_person.summary} \n\nDela upp den i 3-4 meningar där varje mening beskriver en specifik detalj om personen. Svara med en mening per rad. Svara ENBART med informationen om personen, ingenting annat.")

    # Blank lines in the answer would otherwise become empty search queries.
    querys = [line.strip() for line in info_bites.split("\n") if line.strip()]
    print_rainbow('Querys:', querys)
    if not querys:
        return None

    chroma_docs = chroma.query(
        query_texts=querys,
        n_results=3,
        collection="mala_interrogations",
    )
    info = ''
    for answer in chroma_docs['documents']:
        for doc in answer:
            print_blue(doc)
            info += doc + "\n"

    # Only quote the mentioning text when one was supplied.
    context = f'Nedan är en text där {self.name} nämns:\n\n{self.text}\n\n' if self.text else ''
    prompt = f'''{context}Jag vill veta vem "{self.unknown_person.name}" är. Läs texten nedan för att se om du kan hitta personens fulla namn:\n
{info}\n
Vad heter "{self.unknown_person.name}"? Svara med förnamn och efternamn på formen "Förnamn Efternamn". Svara "None" om det inte går att säga utifrån informationen.'''
    print_yellow('Längd på info:', len(info))
    print_rainbow('Prompt', prompt)
    answer = llm.generate(prompt)
    print_green(answer)
    return answer
def find_person(self):
    """Finds a person in the Chroma db.

    The unknown person's name is searched in ``mala_persons``. If the best
    hit has a distance above 1, the person's summary is searched in
    ``mala_persons_info`` instead. The person's own name and every name
    listed under ``is_not`` are excluded from both searches.

    Returns:
        FoundPerson | None: The best match, or ``None`` when there is no hit
        or the final distance is above 1.
    """
    unknown = self.unknown_person

    # Build a flat list of names to exclude. ``is_not`` entries may be plain
    # names or lists of names, so both shapes are accepted.
    excluded_names = [unknown.name]
    for entry in unknown.doc.get("is_not", []):
        if isinstance(entry, (list, tuple)):
            excluded_names.extend(entry)
        else:
            excluded_names.append(entry)
    filter_isnot = {"name": {"$nin": excluded_names}}

    query_results = chroma.query(
        query_texts=[unknown.name],
        n_results=1,
        where=filter_isnot,
        collection="mala_persons",
    )
    if not query_results["distances"][0]:
        return None
    distance = query_results["distances"][0][0]
    print_purple(query_results["metadatas"][0][0]["name"], distance)

    if distance > 1:  #! This is not really working...
        # The name alone was not close enough; search on the summary instead.
        unknown.make_summary()
        query_results = chroma.query(
            query_texts=[unknown.summary],
            n_results=1,
            where=filter_isnot,
            collection="mala_persons_info",
        )
        if not query_results["distances"][0]:
            return None
        distance = query_results["distances"][0][0]
        print_yellow(query_results["metadatas"][0][0]["name"], distance)
        if distance > 1:
            return None

    print_blue("Name found peson:", query_results["documents"][0][0])
    best_meta = query_results["metadatas"][0][0]
    return FoundPerson(db, name=best_meta["name"], key=best_meta["_key"])
def identify(self):
    """Find a candidate for the unknown person and collect the LLM's verdicts.

    Sets ``self.found_person`` and ``self.suggestions``. When no candidate is
    found, ``suggestions`` holds one ``(None, interrogation_id)`` tuple per
    interrogation. Otherwise it holds one ``(answer, interrogation_document)``
    tuple per interrogation, where ``answer`` is the LLM's JA/NEJ verdict
    with a short explanation.
    """
    llm = LLM(small=True)
    interrogation_ids = self.unknown_person.doc.get("mentioned_in_interrogation", [])

    self.found_person = self.find_person()
    if not self.found_person:
        self.suggestions = [(None, i) for i in interrogation_ids]
        # Nothing to compare against, so stop here.
        return

    # Summarize the found persons info
    self.found_person.make_summary()

    suggestions = []
    for interrogation_id in interrogation_ids:
        interrogation_data = db.collection("interrogations").get(interrogation_id)
        text = interrogation_data["text"]
        answer_prompt = f'''I texten nedan omnämns en "{self.unknown_person.name}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n
TEXT:
"""{text}"""\n
På andra ställen i polisens förundersökning finns en person som heter "{self.found_person.name}", och som beskrivs här:
"""{self.found_person.summary}"""\n
Verkar det troligt att personen som kallas {self.unknown_person.name} är samma person som {self.found_person.name}? Svara bara JA eller NEJ, samt en kort förklaring till varför.
'''
        answer = llm.generate(answer_prompt)
        suggestions.append((answer, interrogation_data))
    self.suggestions = suggestions
def verify(
    db,
    answer=None,
    unknown_person=None,
    found_person=None,
    interrogation_key=None,
):
    """
    Store the user's verdict on whether two persons are the same.

    Args:
        db: The database object.
        answer (str): The verdict. Can be "Yes", "No", or "Unknown".
        unknown_person: The unverified person; its ``doc`` and ``name`` are used.
        found_person: The suggested match; its ``doc`` is used.
        interrogation_key (str): The key identifying the interrogation.

    Returns:
        None
    """
    print_blue("Answer:", answer)

    # If the answer is Yes
    if answer == "Yes":
        unknown_doc = unknown_person.doc
        found_doc = found_person.doc

        # The mention now belongs to the found person.
        mentions = unknown_doc["mentioned_in_interrogation"]
        if interrogation_key in mentions:
            mentions.remove(interrogation_key)
        db.collection("persons").update(unknown_doc)

        found_doc["confirmed"] = True
        # Carry over what was known about the unknown person, skipping
        # entries the found person already has.
        found_info = found_doc.setdefault("info", [])
        for item in unknown_doc.get("info", []):
            if item not in found_info:
                found_info.append(item)
        found_mentions = found_doc.setdefault("mentioned_in_interrogation", [])
        if interrogation_key is not None and interrogation_key not in found_mentions:
            found_mentions.append(interrogation_key)

        print("Updated person in arango:")
        print_green(
            db.collection("persons").insert(found_doc, overwrite_mode="update")
        )

        # With no mentions left, the unknown person is moved out of `persons`.
        if (
            unknown_doc["mentioned_in_interrogation"] == []
            and unknown_doc["_key"] != found_doc["_key"]
        ):
            db.collection("other_persons").insert(
                unknown_doc, overwrite_mode="update"
            )
            db.collection("persons").delete(unknown_doc, check_rev=False)
            print_red(f"Removed {unknown_doc}")

    # If the answer is No
    if answer == "No":
        # Remember the rejected name as a plain string.
        rejected = unknown_person.doc.setdefault("is_not", [])
        if found_person.doc["name"] not in rejected:
            rejected.append(found_person.doc["name"])
        db.collection("persons").update(
            unknown_person.doc, merge=True, check_rev=False
        )

    # If the answer is Unknown
    if answer == "Unknown":
        db.collection("unknown").insert(
            {"name": unknown_person.name, "interrogation": interrogation_key},
            overwrite=True,
        )
class PersonFinder:
    """Finds names of persons mentioned in chunks of an interrogation text.

    Attributes:
        names (dict): Names that are already known, mapped to whatever the
            caller associates with them. ``extract_names`` adds shorter
            variants of known names to it.
        llm: The LLM used to extract names.
        text_splitter: Splitter configured with the given chunk settings.
    """

    def __init__(
        self,
        names=None,
        chunk_size=5000,
        chunk_overlap=0,
        separator="\n\n",
    ):
        # A fresh dict per instance; a shared default would leak names
        # between instances.
        self.names = {} if names is None else names
        self.llm = LLM(
            chat=False,
            small=True,
            system_prompt="Du är en assistent som hjälper till att hitta personer i ett polisförhör. Svara bara när personen finns i den del du får, hitta inte på personer.",
        )
        self.text_splitter = CharacterTextSplitter(
            separator=separator,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )

    def extract_names(self, chunk, extra_prompt=""):
        """Ask the LLM for the names mentioned in ``chunk``.

        A returned name that is contained in an already known name is treated
        as a variant of it: it is registered in ``self.names`` with the known
        name's value and left out of the result.

        Args:
            chunk (str): The text to search.
            extra_prompt (str): Extra instructions appended to the prompt.

        Returns:
            list: New names found in the chunk, in the order they were given.
        """
        chunk_names = []
        # Find persons in the text
        prompt = f'''Jag vill hitta alla personer som nämns i texten nedan:\n
"""{chunk}"""\n
Vilka personer nämns i texten? Svara ENBART med en pythonformaterad lista av namn.
Exempel på svar för att du ska förstå formen:
<exempel>
[namn1, namn2, namn3].
</exempel
Var noga med att svara
{extra_prompt}'''
        response = self.llm.generate(prompt)
        # Keep only letters, hyphens, spaces and commas from the answer.
        response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")

        candidates = (part.strip() for part in response.split(","))
        for name in candidates:
            # Very short fragments are noise, not names.
            if len(name) <= 2:
                continue
            if name in chunk_names or name in self.names:
                continue
            same_name = False
            # Iterate over a snapshot since the dict is extended in the loop.
            for known in list(self.names):
                if name in known:
                    same_name = True
                    self.names[name] = self.names[known]
            if not same_name:
                chunk_names.append(name)
        return chunk_names
if __name__ == "__main__":
    # Manual try-out: look up one person mentioned in a single interrogation.
    text = db.collection('rumors').get('Mikael_Sjostrom_2023-02-13_p.98')
    example_doc = {
        'name': 'Douglas',
        'interrogation_key': "_'Larsson',_'_Neo'__2023-02-15_p.208",
    }
    person = PersonIdentifier(doc=example_doc)
    person.find_with_llm()

@ -1,11 +0,0 @@
from _arango import arango
from _llm import LLM
llm = LLM(keep_alive=6000, chat=False)

# List every interrogation that has a reason, with the person's name in green.
q = 'for doc in interrogations filter doc.reason != null return doc'
docs = list(arango.db.aql.execute(q))
for doc in docs:
    print("\033[92m", doc['person'], "\033[0m", doc['reason'])

@ -1,23 +1,58 @@
from random import choice
def print_green(*args):
    """Print all arguments in bright green, each followed by one space."""
    text = "".join(str(arg) + " " for arg in args)
    print(f"\033[92m{text}\033[0m")
def print_red(*args):
    """Print all arguments in bright red, each followed by one space."""
    text = "".join(str(arg) + " " for arg in args)
    print(f"\033[91m{text}\033[0m")
def print_yellow(*args):
    """Print all arguments in bright yellow, each followed by one space."""
    text = "".join(str(arg) + " " for arg in args)
    print(f"\033[93m{text}\033[0m")
def print_blue(*args):
    """Print all arguments in bright blue, each followed by one space."""
    text = "".join(str(arg) + " " for arg in args)
    print(f"\033[94m{text}\033[0m")
def print_purple(*args):
    """Print all arguments in bright purple, each followed by one space."""
    pieces = [str(arg) + " " for arg in args]
    text = "".join(pieces)
    print(f"\033[95m{text}\033[0m")
def choose_color(last_color_index):
    """Return the color that follows ``last_color_index`` in a fixed cycle.

    The cycle is blue, green, yellow, red, purple and wraps around, so
    passing -1 yields the first color.

    Returns:
        tuple: ``(ansi_code, color_name, color_index)`` of the chosen color.
    """
    palette = (
        ("blue", "\033[94m"),
        ("green", "\033[92m"),
        ("yellow", "\033[93m"),
        ("red", "\033[91m"),
        ("purple", "\033[95m"),
    )
    next_index = (last_color_index + 1) % len(palette)
    name, code = palette[next_index]
    return code, name, next_index
def print_rainbow(*args):
    """Print the arguments on one line, each in the next color of the cycle.

    Colors come from ``choose_color``, starting with its first color. Every
    argument is followed by a color reset and one space.
    """
    color_index = -1
    text = ""
    for arg in args:
        color_code, color, color_index = choose_color(color_index)
        text += f"{color_code}{arg}\033[0m "
    print(text)

@ -0,0 +1,15 @@
from _llm import LLM
from _arango import arango
from print_color import *
llm = LLM(chat=False)

# Ask the LLM what every interrogation says about the Saturday evening.
interrogations = list(arango.db.collection("interrogations").all())
for interrogation in interrogations:
    text = interrogation['text']
    prompt = f'Vad sägs om lördagskvällen i texten nedan? \n\n"""{text}""" Jag vill veta vad som sägs i texten om lördagskvällen. Var noga med prsonre, namn och platser.'
    answer = llm.generate(prompt)

    # Name of the interrogated person in blue, followed by the answer.
    print_blue(interrogation['person'])
    print(answer, '\n')

@ -0,0 +1,94 @@
from _chroma import ChromaDB
from _openai import LLM_OpenAI as LLM
import streamlit as st
from print_color import *
def get_docs(user_input):
    """Return the five best hits for ``user_input`` from 'mala_interrogations'."""
    return chroma.query('mala_interrogations', user_input, n_results=5)
def generate_prompt(user_input, docs):
    """Build the prompt that asks the LLM to answer from the retrieved passages.

    This function only builds the text; sending it to the LLM is left to the
    caller, so each question results in a single LLM call.

    Args:
        user_input (str): The user's question.
        docs (dict): Query result whose ``documents`` and ``metadatas`` each
            hold one list of hits; every metadata entry needs ``person`` and
            ``date``.

    Returns:
        str: The complete prompt.
    """
    sections = []
    for text, meta in zip(docs['documents'][0], docs['metadatas'][0]):
        # Upper-cased header naming the interrogated person and the date.
        header = f'\n\nFrån förhör med {meta["person"]} {meta["date"]}:'.upper()
        sections.append(f'{header}\n{text}\n\n')
    string = ''.join(sections)

    prompt = f'''Svara på frågan: {user_input}\n
Använd endast informationen nedan:\n
{string}\n
Skriv utförligt på svenska och var noga med detaljer som namn, plats och datum.
Får gärna med information från alla fem förhör om det är relevant.\n
{user_input}'''
    return prompt
st.set_page_config(page_title="Malå")

# Created once per session and kept in the session state across reruns.
if "llm" not in st.session_state:
    st.session_state.llm = LLM(chat=True, system_prompt='Du är assistent åt en journalist som går igenom förhör i en förundersökning. Svara bara utifrån den information du får. Svara alltid på svenska!')

# Recreated on every run.
llm_checker = LLM(chat=True)
chroma = ChromaDB()

# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    role, content = message["role"], message["content"]
    with st.chat_message(role):
        st.markdown(content)
# Accept user input
if user_input := st.chat_input("Fråga något om förhören."):
print_blue(user_input)
if len(st.session_state.messages) > 1:
history = ''
for message in st.session_state.messages:
history += f"{message['role']}: {message['content']}\n'"
prompt = f'En användare har ställt frågan "{user_input}" och här är chatthistoriken mellan användaren och en assistent:\n{history}\n\nVerkar "{user_input}" vara en uppföljningfråga eller en fristående fråga? Svara ENDAST med "uppföljning" eller "fristående".'
chat_completion = llm_checker.generate(prompt, stream=False)
answer = chat_completion.choices[0].message.content
print_red(answer)
if 'uppföljning' in answer:
prompt=f'Använd historiken till att omformulera "{user_input}" till en helt fristående fråga. Frågan ska användas för att hitta information i förhören.'
chat_completion = llm_checker.generate(prompt, stream=False)
question2chroma = chat_completion.choices[0].message.content
if 'fristående' in answer:
question2chroma = user_input
if 'None' in answer:
question2chroma = user_input
print_yellow(question2chroma)
else:
question2chroma = user_input
# Add user message to chat history
st.session_state.messages.append({"role": "user", "content": user_input})
# Display user message in chat message container
with st.chat_message("user"):
st.markdown(user_input)
# Display assistant response in chat message container
with st.chat_message("assistant"):
docs = get_docs(question2chroma)
prompt = generate_prompt(user_input, docs)
stream = st.session_state.llm.generate(prompt)
response = st.write_stream(stream)
st.session_state.llm.messages.append({'role': 'assistant', 'content': response})
st.session_state.messages.append({"role": "assistant", "content": response})
print()
Loading…
Cancel
Save