parent
b5ad23f652
commit
744b1f02f2
21 changed files with 1971 additions and 381 deletions
@ -0,0 +1,335 @@ |
||||
import streamlit as st |
||||
from identify_person import identify, verify, find_person, UnverifiedPerson, FoundPerson |
||||
from _arango import arango |
||||
import re |
||||
from fuzzywuzzy import process |
||||
from _llm import LLM as LLM_garda |
||||
from _openai import LLM_OpenAI as LLM |
||||
from print_color import * |
||||
from random import randint |
||||
|
||||
|
||||
# from print_color import * |
||||
print("Start") |
||||
|
||||
|
||||
def reset_choices():
    """Clear all three user selections kept in the Streamlit session state."""
    for choice_key in ("user_choice", "unconfirmed_choice", "custom_choice"):
        setattr(st.session_state, choice_key, None)
||||
|
||||
|
||||
def check_if_dict_in_list(target_dict, list_of_dicts):
    """Return True if some dict in *list_of_dicts* shares the first key/value
    pair of *target_dict*.

    Only the first item of each dict is compared: the dicts used here are
    single-entry ``{mentioned_as_name: interrogation_key}`` records.
    """
    target_key, target_value = list(target_dict.items())[0]
    return any(
        list(candidate.items())[0] == (target_key, target_value)
        for candidate in list_of_dicts
    )
||||
|
||||
|
||||
def submitted():
    # Form-submit callback: flag that the user pressed "Nästa" so the
    # submission branch at the bottom of the script runs on the next rerun.
    st.session_state.next = True
||||
|
||||
|
||||
@st.cache_data()
def sort_names_by_similarity(target_name, name_list):
    """Return *name_list* ordered by fuzzy similarity to *target_name*,
    best match first (cached by Streamlit)."""
    # fuzzywuzzy scores every candidate as (name, score) tuples.
    scored_names = process.extract(target_name, name_list, limit=len(name_list))

    # Rank by score, highest first (stable, like the original in-place sort).
    ranked = sorted(scored_names, key=lambda pair: pair[1], reverse=True)

    # Drop the scores, keep the names.
    return [candidate for candidate, _score in ranked]
||||
|
||||
|
||||
@st.cache_data()
def get_persons():
    """Return all documents from the ArangoDB "persons" collection (cached)."""
    return list(arango.db.collection("persons").all())
||||
|
||||
|
||||
@st.cache_data()
def get_unverified_persons():
    """Return all person documents not yet confirmed (cached).

    Uses the module-level ``db`` handle (bound further down before this is
    first called). ``!= true`` also matches documents where ``confirmed``
    is missing or null.
    """
    q = "for doc in persons filter doc.confirmed != true return doc"
    # list() instead of the redundant [i for i in ...] comprehension.
    return list(db.aql.execute(q))
||||
|
||||
|
||||
@st.cache_data()
def get_suggestions(person):
    """Run identification for *person* and stash the results in session state.

    Sets ``unverified_person``, ``found_person`` and ``suggestions`` on
    ``st.session_state``.
    NOTE(review): assumes the imported ``identify`` returns a dict with
    exactly these three keys — confirm against identify_person's current API.
    """
    suggestion = identify(person)
    st.session_state.unverified_person = suggestion["unverified_person"]
    st.session_state.found_person = suggestion["found_person"]
    st.session_state.suggestions = suggestion["suggestions"]
||||
|
||||
|
||||
def caps(string):
    """Format a person name for display in the selectbox.

    Confirmed names (no "*" marker) are shown in upper case; unconfirmed
    names keep their casing but have every "*" stripped.
    """
    return string.replace("*", "") if "*" in string else string.upper()
||||
|
||||
|
||||
def get_unverified_person():
    """Pop a random unverified person from session state and wrap it.

    When the list is empty, ``randint(0, -1)`` raises ValueError; that is
    (ab)used as the "nothing left to verify" signal — show a message and
    halt the script with st.stop().
    """
    try:
        n = randint(0, len(st.session_state.unverified_persons) - 1)
        doc = st.session_state.unverified_persons.pop(n)
        st.session_state.unverified_person = UnverifiedPerson(doc)
    except ValueError:
        st.markdown(":green[Inga fler personer att verifiera.]")
        st.stop()
||||
|
||||
|
||||
st.set_page_config(
    page_title="Malå",
)

# Get URL parameters
params = st.query_params
param_person_key = params.get("person_key", None)

db = arango.db

# Add a session state to store the persons and unconfirmed persons etc
if "next" not in st.session_state:
    st.session_state.next = False
if "persons" not in st.session_state:
    st.session_state.persons = get_persons()

    # Build the display list once persons are loaded; unconfirmed persons
    # are tagged with a trailing "*" (stripped/upper-cased again by caps()).
    # NOTE(review): indentation reconstructed — presumably this runs only on
    # first load, inside the "persons" guard; confirm against the original.
    all_persons_name_list = []
    for person in st.session_state.persons:
        name = person["name"]
        if not person["confirmed"]:
            name += "*"
        all_persons_name_list.append(name)
    st.session_state.persons_names = all_persons_name_list
    # name -> _key lookup used when the user picks an alternative.
    st.session_state.persons_dict = {
        i["name"]: i["_key"] for i in st.session_state.persons
    }

if "unverified_persons" not in st.session_state:
    if param_person_key:
        # If a person key is provided in the URL, only show that person
        st.session_state.unverified_persons = list(
            db.aql.execute(
                "for doc in persons filter doc._key == @key return doc",
                bind_vars={"key": param_person_key},
            )
        )
        print_blue("param_person_key".upper(), st.session_state.unverified_persons)
    else:
        st.session_state.unverified_persons = get_unverified_persons()

if "persons_names" not in st.session_state:
    st.session_state.persons_names = arango.get_persons(confirmed=False)["names"]

# Initialise the three user-selection slots (see reset_choices()).
if "user_choice" not in st.session_state:
    st.session_state.user_choice = None
if "unconfirmed_choice" not in st.session_state:
    st.session_state.unconfirmed_choice = None
if "custom_choice" not in st.session_state:
    st.session_state.custom_choice = None

# Pick the next person to verify when none is active or the queue changed.
if (
    "unverified_person" not in st.session_state
    or not st.session_state.unverified_persons
):
    get_unverified_person()

if "found_person" not in st.session_state:
    st.session_state.found_person = None
if "suggestions" not in st.session_state:
    # Set new values for unverified_person, found_person and suggestions (as session_state)
    get_suggestions(st.session_state.unverified_person.__dict__)
    print_yellow("SUGGESTIONS", st.session_state.suggestions)

if "suggestion" not in st.session_state:
    st.session_state.suggestion = st.session_state.suggestions.pop(0)


# Get unconfirmed person, found person and answer from the suggestions
unverified_person: UnverifiedPerson = st.session_state.unverified_person
found_person: FoundPerson = st.session_state.found_person

# A suggestion is an (answer, interrogation) pair; the interrogation may be
# stored as a key string and must then be resolved to its document.
answer = st.session_state.suggestion[0]
interrogation_doc = st.session_state.suggestion[1]
if isinstance(interrogation_doc, str):
    interrogation_doc = db.collection("interrogations").get(interrogation_doc)
text = interrogation_doc["text"]
||||
|
||||
|
||||
st.markdown(
    f'Namnet **"{st.session_state.unverified_person.name}"** används i **{len(st.session_state.unverified_person.mentioned_in_interrogation)}** förhör. Namnet kan syfta på olika personer i olika sammanhang så vi går igenom förhören ett och ett.'
)

if answer:
    # The LLM produced a yes/no judgement: show it and preselect the radio
    # button accordingly (0 = "Ja", 1 = "Nej", None = no preselection).
    answer = answer.replace("\n", " ")
    st.markdown(
        f"Är :blue[{unverified_person.name}] samma som :blue[{found_person.name}]?"
    )
    print(found_person.__dict__)
    st.write(f'(från förhör med {interrogation_doc["person"]})')
    if "JA" in answer:
        st.markdown(f"🤖\n:green[{answer.replace('JA ', '')}]")
        radio_index = 0
    elif "NEJ" in answer:
        radio_index = 1
        st.markdown(f"🤖\n:red[{answer.replace('NEJ ', '')}]")
    else:
        radio_index = None
        st.markdown(f"🤖\n{answer}")

    # Let the user expand for more info

else:
    # No LLM judgement available — ask the open question instead.
    st.markdown(f"Vem är :blue[{unverified_person.name}]?")
    st.write(f'(från förhör med {interrogation_doc["person"]})')
    # Show the information about the suggested person


# Edit and show the interrogation text: convert dialogue markers to bold
# HTML, collapse single newlines, and highlight the person's name in red.
with st.expander(f"Mer information om förhöret"):
    text = text.replace("\nFL:", "<br>**FL:** ").replace("\nDH:", "<br>**DH:** ")
    text = re.sub(r"\n(?!\n)", "", text)
    text = re.sub(r"\n\n+", "\n", text)
    text = text.replace("\n", "<br>").replace(
        unverified_person.name, f"**:red[{unverified_person.name}]**"
    )
    st.markdown(f"##### Förhöret:\n{text}", unsafe_allow_html=True)
||||
|
||||
# A form to let the user select an alternative
with st.form("select_alternative"):

    if answer:
        # Let the user decide if the suggested person is the same as the unconfirmed person
        st.session_state.user_choice = st.radio(
            "Select alternative",
            ("Ja", "Nej", "Vet ej"),
            key="user_choice_radio",
            index=radio_index,
        )

    else:
        st.session_state.user_choice = None

    # Let the user select an alternative person, ranked by name similarity.
    alternatives = sort_names_by_similarity(
        unverified_person.name, st.session_state.persons_names
    )

    st.session_state.unconfirmed_choice = st.selectbox(
        "Välj någon som stämmer",
        alternatives,
        placeholder="Sök en annan",
        index=None,
        key="multiselect",
        format_func=caps,
        help="Personer i caps är bekräftade personer, välj någon av dem om det verkar stämma.",
    )

    # Let the user enter a custom alternative
    st.session_state.custom_choice = st.text_input(
        "Annan person",
        key="custom",
        help="Skriv in namnet på personen om det inte finns i listan. Var noga med stavningen.",
    )
    # Normalise the empty string to None so truthiness tests work below.
    st.session_state.custom_choice = (
        None if st.session_state.custom_choice == "" else st.session_state.custom_choice
    )

    # If the user has made a selection
    st.session_state.next = st.form_submit_button("Nästa", on_click=submitted)

# Priority of the three inputs: custom name > picked alternative > radio answer.
if st.session_state.next:
    if st.session_state.custom_choice:
        # Brand-new person: summarise what the interrogation says about them
        # and insert a fresh document into "persons".
        print("CUSTOM CHOICE", st.session_state.custom_choice)
        llm = LLM()
        info = llm.generate(
            f'Nedan är ett polisförhör där en person omnämns som "{unverified_person.name}".\n\n{interrogation_doc["text"]}\n\nSammanfatta informationen om {unverified_person.name} på ett detaljerat sätt, var noga med namn, platser, händelser och relationer. Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. '
        )
        person_in_arango = db.collection("persons").insert(
            {
                "_key": arango.fix_key_name(st.session_state.custom_choice),
                "name": st.session_state.custom_choice,
                "info": [info],
                "mentioned_in_interrogation": [interrogation_doc["_key"]],
                "mentioned_as": [{unverified_person.name: interrogation_doc["_key"]}],
            }
        )
        # NOTE(review): this verifies against the *suggested* found_person,
        # not the freshly inserted person_in_arango — confirm intent.
        verify(
            db,
            "Yes",
            unverified_person.doc,
            found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )

    elif st.session_state.unconfirmed_choice:

        # Strip the unconfirmed "*" marker before looking the person up.
        unconfirmed_choice = st.session_state.unconfirmed_choice.replace(
            "*", ""
        ).strip()

        print_yellow("OTHER CHOICE", unconfirmed_choice)
        doc = db.collection("persons").get(
            st.session_state.persons_dict[unconfirmed_choice]
        )
        found_person = FoundPerson(
            db, unconfirmed_choice, st.session_state.persons_dict[unconfirmed_choice]
        )
        print("NEW:", found_person.name)

        # NOTE(review): passes the wrapper objects here but .doc elsewhere —
        # confirm which form verify() expects.
        verify(db, "Yes", unverified_person, found_person, interrogation_doc["_key"])

    elif st.session_state.user_choice == "Ja":
        print("USER CHOICE", st.session_state.user_choice)
        # Record under which alias this person was mentioned (deduplicated).
        # NOTE(review): `person` here is a leftover loop variable from the
        # name-list build above — presumably unverified_person was intended.
        if "mentioned_as" not in found_person.doc:
            found_person.doc["mentioned_as"] = []
        if not check_if_dict_in_list(
            {person["name"]: interrogation_doc["_key"]},
            found_person.doc["mentioned_as"],
        ):
            found_person.doc["mentioned_as"].append(
                {person["name"]: interrogation_doc["_key"]}
            )
        # NOTE(review): keyword names person/person_in_arango do not match
        # verify()'s current signature (unknown_person/found_person) — verify.
        verify(
            db,
            answer="Yes",
            person=person,
            person_in_arango=found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )
    elif st.session_state.user_choice == "Nej":
        # NOTE(review): keyword `unverified_person=` does not match verify()'s
        # parameter name `unknown_person` — confirm against identify_person.
        verify(
            db,
            "No",
            unverified_person=unverified_person.doc,
            found_person=found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )

    elif st.session_state.user_choice == "Vet ej":
        verify(
            db,
            "Unknown",
            unverified_person=unverified_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )

    # Move on: next suggestion, or next person when suggestions run out.
    reset_choices()
    if not param_person_key:
        if st.session_state.suggestions != []:
            st.session_state.suggestion = st.session_state.suggestions.pop(0)
        else:
            get_unverified_person()
            get_suggestions(st.session_state.unverified_person.__dict__)
            st.session_state.suggestion = st.session_state.suggestions.pop(0)
        st.rerun()
    else:
        # Single-person mode (opened via URL): done after one verification.
        st.markdown(":green[Tack!] Du kan stäna de här fliken nu.")
        st.stop()
||||
@ -0,0 +1,76 @@ |
||||
from openai import OpenAI, RateLimitError |
||||
from dotenv import load_dotenv |
||||
import os |
||||
from _llm import LLM as LLM_ollama |
||||
from print_color import * |
||||
from time import sleep |
||||
load_dotenv() |
||||
|
||||
class LLM_OpenAI:
    """Thin wrapper around the OpenAI chat-completions API with an Ollama fallback.

    Optionally keeps a rolling chat history trimmed to ``max_tokens``
    (measured in *characters*, not real tokens — see build_message), and
    falls back to the local Ollama model on rate limits or when local=True.
    """

    def __init__(
        self,
        system_prompt='Svara alltid på svenska. Svara bara på det som efterfrågas. Om du inte kan svara, skriv "Jag vet inte".',
        chat=False,
        model="gpt-3.5-turbo-0125",
        max_tokens=24000,
        sleep_time=0
    ):
        # chat=True keeps a running message history across generate() calls.
        self.chat = chat
        self.model = model
        self.temperature=0
        # Character budget for the chat history (despite the name).
        self.max_tokens = max_tokens
        self.system_message = {"role": "system", "content": system_prompt}
        self.messages =[self.system_message]
        self.client = OpenAI(
            # This is the default and can be omitted
            api_key=os.getenv("OPEN_AI"),
        )
        self.llm_ollama = LLM_ollama(chat=False, stream=True)  # For backup
        # Optional delay before each request (crude rate limiting).
        self.sleep_time = sleep_time

    def build_message(self, message):
        """Append a user message and trim old messages to the size budget."""
        # Add the new message to the list
        self.messages.append({"role": "user", "content": message})

        # Calculate the total token length of the messages.
        # NOTE(review): this counts characters, not tokens.
        total_tokens = sum([len((msg["content"])) for msg in self.messages])

        # While the total token length exceeds the limit, remove the oldest messages
        while total_tokens > self.max_tokens:
            removed_message = self.messages.pop(
                1
            )  # Remove the oldest message (not the system message)
            total_tokens -= len((removed_message["content"]))

    def generate(self, prompt, stream=False, local=False):
        """Send *prompt* to the model and return the reply.

        Returns the raw response object when stream=True, otherwise the
        answer text. local=True (or a RateLimitError) routes to Ollama.
        """
        sleep(self.sleep_time)
        if self.chat:
            self.build_message(prompt)
            messages = self.messages
        else:
            # One-shot mode: system prompt + this prompt only.
            messages = [self.system_message, {"role": "user", "content": prompt}]
        print(sum([len((msg["content"])) for msg in messages]))

        if local:
            response = self.llm_ollama.generate_stream(prompt)

        else:
            try:
                response = self.client.chat.completions.create(
                    messages=messages,
                    model=self.model,
                    stream=stream
                )
            except RateLimitError as e:
                print_red(e)
                response = self.llm_ollama.generate_stream(prompt)

        if stream:
            return response
        else:
            # NOTE(review): if the Ollama fallback fired, `response` is a
            # stream, not a completions object, and this access would fail —
            # confirm the fallback is only used with stream=True.
            answer = response.choices[0].message.content
            if self.chat:
                self.messages.append({"role": "assistant", "content": answer})
            return answer
||||
|
||||
|
||||
@ -0,0 +1,43 @@ |
||||
from _arango import db |
||||
from _llm import LLM |
||||
from langchain_text_splitters import CharacterTextSplitter |
||||
from print_color import * |
||||
# Batch job: markdown-format every interrogation that has no formatted_text yet.
interrogations = list(db.aql.execute('for doc in interrogations filter doc.formatted_text == null return doc', count=True))


text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=2000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)


for interrogation in interrogations:
    text = interrogation['text']

    # Format the text in ~2000-char chunks, then join them back together.
    chunks = text_splitter.split_text(text)
    formated_chunks = []
    for chunk in chunks:
        print_yellow(len(chunk))

        # NOTE(review): a fresh LLM is built per chunk — presumably could be
        # hoisted outside both loops; indentation reconstructed from a
        # garbled source, confirm the nesting.
        llm = LLM(chat=False, system_prompt='Du formaterar text enligt med markdown för att göra den lättare att läsa. Använd inte rubriker, bara fet och stil. Om det förekommer en dialog fetmarkera den som talar, exempelvis ** DH: **. Namn ska göras fetade, även om det bara är ett förnamn. Svara alltid med EXAKT samma text som du fick, men formaterad. Svara alltid på svenska.')
        # Backfill the interviewee's name from the "person" field if missing.
        if 'name' not in interrogation:
            interrogation['name'] = interrogation['person']
        name = interrogation['name']

        prompt = f'''Kolla på texten nedan: \n\n\n{chunk}\n\n\nJag vill att du svarar med EXAKT samma text, men formaterad enligt markdown för att vara enklare att läsa. Formatera enligt följande:
- Använd aldrig rubriker (#)
- Om det är en längre dialog mellan förhörsledare (FL) och den hörde (DH) så formatera dem med fetstil, exempelvis **DH: **.
- Gör namn på personer fetade, även om det bara är ett förnamn. Den förhörde – {name} – ska inte vara fetad utan normal text.
Ibland är styckeindelningen inte korrekt, försök att göra det lättare att läsa.
Svara ENBART med den formaterade texten, ingenting annat.'''
        formatted_chunk = llm.generate(prompt)
        print_blue(formatted_chunk)
        formated_chunks.append(formatted_chunk)
    # Persist the reassembled formatted text on the interrogation document.
    formatted_text = '\n '.join(formated_chunks)
    interrogation['formatted_text'] = formatted_text
    db.collection('interrogations').update(interrogation, check_rev=False)
||||
|
||||
|
||||
@ -0,0 +1,26 @@ |
||||
from _arango import db |
||||
from _llm import LLM |
||||
from print_color import * |
||||
# Maintenance job: replace person names in relation endpoints ('to'/'from')
# with the stable person_id from the matching interrogation document.
relations = list(db.aql.execute('for doc in relations return doc', count=True))

for relation in relations:
    interrogation = db.collection('interrogations').get(relation['interrogation'])
    if not interrogation:
        # Dangling reference — log it and move on.
        print_red(relation)
        continue
    for i in ['to', 'from']:
        # Backfill 'name' on the interrogation document if it is missing.
        if 'name' not in interrogation:
            interrogation['name'] = interrogation['person']
            db.collection('interrogations').update(interrogation, check_rev=False)
        # Swap name (or alias) for the canonical person_id.
        if relation[i] == interrogation['name']:
            relation[i] = interrogation['person_id']
        elif relation[i] == interrogation['person_mentioned_as']:
            relation[i] = interrogation['person_id']
    # Debug output of what is about to be written.
    for k, v in relation.items():
        print_rainbow(k, v)
    print()
    for k, v in interrogation.items():
        if k == 'text' or k == 'formatted_text':
            continue
        print_rainbow(k, v)
    db.collection('relations').update(relation, check_rev=False)
||||
@ -0,0 +1,34 @@ |
||||
from _llm import LLM |
||||
from _arango import db |
||||
from _chroma import chroma |
||||
from print_color import * |
||||
from identify_person import find_person |
||||
|
||||
|
||||
llm = LLM(small=True) |
||||
|
||||
|
||||
def check_from(relations):
    """Mark relations whose 'from' endpoint is the interrogated person.

    For each relation, asks the LLM whether relation['from'] is the person
    being interrogated; on "JA" stores that person's id as 'from_key' and
    updates the relation document in ArangoDB.
    Uses the module-level `db` and `llm`.
    """
    for relation in relations:
        interrogation = db.collection('interrogations').get(relation['interrogation'])
        # Only the first 2000 chars of the interrogation are shown to the LLM.
        text = f"Hörd person: {interrogation['person']}\n{interrogation['text']}"
        prompt = f"""Är "{relation['from']}" personen som förhörs i texten nedan?\n\n{text[:2000]}\n\nSvara enbart JA eller NEJ."""
        answer = llm.generate(prompt)
        if 'JA' in answer:
            relation['from_key'] = interrogation['person_id']
            db.collection('relations').update(relation, check_rev=False)
        print_rainbow(relation['from'], interrogation['person'], answer)
||||
|
||||
|
||||
# Exploratory pass over relations still lacking from_key.
q = "for doc in relations filter doc.from_key == null limit 10 return doc"  #! Limit 10
relations = list(db.aql.execute(q))

for relation in relations:
    # Concatenate the relation descriptions.
    # NOTE(review): `desc` is built but never used below — leftover?
    desc = ''
    for r in relation['relations']:
        desc += r['description'] + '\n'
    desc = desc.strip()
    print_green(relation['to'])
    print(find_person(name=relation['to']))
    print()
||||
@ -0,0 +1,423 @@ |
||||
from _chroma import chroma |
||||
from _arango import arango, db |
||||
from _llm import LLM |
||||
from print_color import * |
||||
import difflib |
||||
import re |
||||
from langchain_text_splitters import CharacterTextSplitter |
||||
|
||||
# text_splitter = CharacterTextSplitter( |
||||
# separator="\n\n", |
||||
# chunk_size=8000, |
||||
# chunk_overlap=0, |
||||
# length_function=len, |
||||
# is_separator_regex=False, |
||||
# ) |
||||
|
||||
|
||||
class Person:
    """Base class holding a person's collected info and its LLM summary."""

    def __init__(self):
        # Free-text information gathered about the person (or None).
        self.info = None
        # LLM-generated summary of self.info (set by make_summary()).
        self.summary = None

    def make_summary(self):
        """Summarize self.info with the LLM, extracting more context from the
        source interrogation first when the existing info is thin.

        Sets self.summary. Subclasses must provide self.doc and self.name.
        NOTE(review): when info is missing this path reads
        self.doc['interrogation_key'] unguarded — KeyError if absent; confirm
        callers always supply it. Indentation reconstructed from a garbled
        source.
        """
        llm = LLM(chat=False, system_prompt="Du sammanfattar information om en person utifrån ett polisförhör. Sammanfattningen ska sedan användas för att göra en sökning i en vektordatabas.")
        info = self.info
        # Too little info: pull text from the source interrogation instead.
        if not self.info or all([len(self.info) < 200, 'interrogation_key' in self.doc, 'name' in self.doc]):
            interrogation_text = db.collection("interrogations").get(self.doc['interrogation_key'])['text']
            # Very long interrogations: keep an 8000-char window around the
            # first occurrence of the person's name (with 1000 chars of lead-in).
            if len(interrogation_text) > 20000:
                if self.doc['name'] in interrogation_text:
                    index = interrogation_text.find(self.doc['name'])
                    if index < 1000:
                        interrogation_text = interrogation_text[:8000]
                    else:
                        interrogation_text = interrogation_text[index-1000:][:8000]

            prompt = f"""Nedan är ett polisförhör:\n
{interrogation_text}\n
Jag är intresserad av en person som omnämns som "{self.doc['name']}". Gör en detaljerad sammanfattning av informationen om {self.name}. Var noga med relationer, namn och platser. Svara ENBART med informationen om personen, ingenting annat. Svara alltid på svenska!"""
            info = llm.generate(prompt)
            # Merge extracted info with whatever we already had.
            if self.info:
                info = self.info + "\n" + info
        print_rainbow(f'Info about: {self.name}', info)
        summary_prompt = f""""Nedan är olika bitar med information om en person:\n
{info}\n
Sammanfatta dessa på ett detaljerat sätt. Var noga med namn, platser, händelser och relationer.
Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat."""
        self.summary = llm.generate(summary_prompt)
||||
|
||||
|
||||
class UnknownPerson(Person):
    """A person mentioned in an interrogation whose identity is unconfirmed.

    Mirrors every key of the source ArangoDB document as an attribute;
    ``info`` is flattened to a newline-joined string (or None when absent)
    and ``name`` defaults to the empty string.
    """

    def __init__(self, doc: dict):
        super().__init__()
        self.doc: dict = doc
        # Expose each document field as an attribute for convenient access.
        for field, value in doc.items():
            setattr(self, field, value)
        # Flatten the "info" list into one string; None when not present.
        self.info = "\n".join(doc["info"]) if "info" in doc else None
        self.name = doc.get("name", "")
||||
|
||||
|
||||
class FoundPerson(Person):
    """
    Represents a person found in ArangoDB.

    Attributes:
        name (str): The name of the person.
        info (str): Additional information about the person, newline-joined
            from the document's "info" list.
        key (str): A unique identifier for the person.
        doc (dict): The person's document in ArangoDB.
        summary (str): A summary of the person's details (via make_summary).
    """

    def __init__(self, db, name, key):
        """Load the person's document from the "persons" collection by *key*.

        Raises TypeError via the join below if no document exists for *key*
        (collection.get returns None).
        """
        super().__init__()
        self.name = name
        self.key = key
        self.doc = db.collection("persons").get(key)
        self.info = "\n".join(self.doc["info"])
||||
|
||||
|
||||
class PersonIdentifier:
    """Works out who an ambiguously named person in an interrogation is.

    Wraps an UnknownPerson and tries to match it against known persons in
    the Chroma vector store / ArangoDB, producing per-interrogation
    suggestions that a human can confirm.
    """

    def __init__(
        self,
        doc: dict = None,
        name: str = None,
        key: str = None,
        person: UnknownPerson = None,
        interrogation_key: str = None,
        text: str = None,
    ):
        """Create an identifier from a document, a name/key, or an object.

        Args:
            doc: Raw person document; its "name"/"_key" override name/key.
            name: Person name, used when no document is available.
            key: ArangoDB _key, used when no document is available.
            person: A pre-built UnknownPerson to use directly.
            interrogation_key: Key of the interrogation the mention is from.
            text: The text in which the name was seen.
        """
        self.doc: dict = doc
        self.name: str = name
        # Fix: guard doc before membership tests — the original did
        # `'name' in doc` unguarded and crashed (TypeError) when doc=None.
        if doc and "name" in doc:
            self.name = doc["name"]
        self.key: str = key
        if doc and "_key" in doc:
            self.key = doc["_key"]
        self.unknown_person: UnknownPerson = None
        self.found_person: FoundPerson = None
        self.suggestions = None
        self.interrogation_key = interrogation_key
        self.text = text

        self.get_unknown_person(doc, name, key, person)

    def get_unknown_person(self, doc, name, key, person):
        """Resolve self.unknown_person from whichever source was provided.

        Precedence: explicit object > raw doc > DB lookup by key >
        name/key stub. Resets found_person as a side effect.
        """
        self.unknown_person = None
        self.found_person = None

        # Set the unknown person
        if person:
            self.unknown_person = person
        elif doc:
            self.unknown_person = UnknownPerson(doc)
        elif key and db.collection("persons").get(key):
            self.unknown_person = UnknownPerson(db.collection("persons").get(key))
        else:
            assert key or name, "Both key and name are missing."
            self.unknown_person = UnknownPerson(
                {k: v for k, v in [("name", name), ("_key", key)] if v}
            )

    def check_name(self, text):
        """Check if it's likely that person and answer_person are the same person.

        Heuristics: single-word names are fuzzy-matched against the found
        person's first/last name (falling back to an LLM check against the
        interrogation text); full names are compared directly with difflib.

        Returns:
            bool: True when the names plausibly refer to the same person.
        """
        print_yellow(self.unknown_person.name, " - ", self.found_person.name)
        same = False

        # If person only has one name, first or last, compare that to first and last name of answer_person
        if len(self.unknown_person.name.strip().split()) == 1:
            llm = LLM()
            answer_first_name = self.found_person.name.split()[0].strip()
            answer_last_name = self.found_person.name.split()[-1].strip()

            if (
                difflib.SequenceMatcher(
                    None, self.unknown_person.name, answer_first_name
                ).ratio()
                > 0.9
            ):
                if answer_last_name in text:
                    same = True
                else:
                    # Count how many times the first name appears among all persons;
                    # a unique first name is accepted as a match on its own.
                    first_names = [
                        i["name"].split()[0] for i in db.collection("persons").all()
                    ]
                    first_name_count = first_names.count(answer_first_name)

                    if first_name_count == 1:
                        same = True
                    else:
                        llm = LLM(small=True)
                        answer = llm.generate(
                            f'Nämns någon med efternamnet "{answer_last_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
                        )
                        if "JA" in answer:
                            same = True

            elif (
                difflib.SequenceMatcher(
                    None, self.unknown_person.name, answer_last_name
                ).ratio()
                > 0.9
            ):
                if answer_first_name in text:
                    same = True
                else:
                    llm = LLM(small=True)
                    answer = llm.generate(
                        f'Nämns någon med förnamnet "{answer_first_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"'
                    )
                    if "JA" in answer:
                        same = True

        else:
            # Full name available on both sides: direct fuzzy comparison.
            name_similarity = difflib.SequenceMatcher(
                None, self.unknown_person.name, self.found_person.name
            ).ratio()

            if name_similarity > 0.85:
                same = True

        return same

    def find_with_llm(self):
        """Ask the LLM (with vector-store context) for the person's full name.

        Builds search sentences from the person's summary, queries Chroma for
        related interrogation passages, then asks the LLM to name the person.

        Returns:
            str: The LLM's answer ("Förnamn Efternamn" or "None").
        """
        if not self.unknown_person.summary:
            self.unknown_person.make_summary()
        llm = LLM(chat=True, system_prompt="Du hjälper till att ta reda på vad en person heter. Först skapar du meningar som ska användas för att söka i en vektordatabas, sedan använder du informationen du får där till att ta reda på vad personen heter. Svara alltid på svenska.")
        print_rainbow('Info bites:', self.unknown_person.summary)
        info_bites = llm.generate(f"Nedan är olika bitar med information om en person:\n\n {self.unknown_person.summary} \n\Dela upp den i 3-4 meningar där varje mening beskriver en specifik detalj om personen. Svara med en mening per rad. Svara ENBART med informationen om personen, ingenting annat.")
        querys = info_bites.split("\n")
        print_rainbow('Querys:', querys)
        chroma_docs = chroma.query(
            query_texts=querys,
            n_results=3,
            collection="mala_interrogations",
        )
        # Concatenate all retrieved passages as context for the final question.
        info = ''
        for answer in chroma_docs['documents']:
            for doc in answer:
                print_blue(doc)
                info += doc + "\n"

        prompt = f'''Nedan är en text där {self.name} nämns:\n\n{self.text}\n\nJag vill veta vem "{self.unknown_person.name}" är. Läs texten nedan för att se om du kan hitta personens fulla namn:\n
{info}\n
Vad heter "{self.unknown_person.name}"? Svara med förnamn och efternamn på formen "Förnamn Efternamn". Svara "None" om det inte går att säga utifrån informationen.'''
        print_yellow('Längd på info:', len(info))
        print_rainbow('Prompt', prompt)
        answer = llm.generate(prompt)
        print_green(answer)
        # Fix: return the result to the caller (it was silently discarded).
        return answer

    def find_person(self):
        """Finds a person in the Chroma db.

        Queries by name first and, when that match is weak (distance > 1),
        retries against the person-info embeddings with an LLM summary.

        Returns:
            FoundPerson or None: the nearest acceptable match, if any.
        """
        # Exclude the person's own name and any previously rejected matches.
        # Fix: the original did `[name].append(...)`, which returns None and
        # produced an invalid "$nin" filter.
        list_filter_isnot = [self.unknown_person.name]
        if "is_not" in self.unknown_person.doc:
            list_filter_isnot += list(self.unknown_person.doc["is_not"])

        filter_isnot = {"name": {"$nin": list_filter_isnot}}

        query_results = chroma.query(
            query_texts=[self.unknown_person.name],
            n_results=1,
            where=filter_isnot,
            collection="mala_persons",
        )

        distance = query_results["distances"][0][0]
        print_purple(query_results["metadatas"][0][0]["name"], distance)

        if distance > 1:  #! This is not really working...
            # Weak name match: retry with a generated summary against the
            # person-info embedding collection.
            self.unknown_person.make_summary()
            query_results = chroma.query(
                query_texts=[self.unknown_person.summary],
                n_results=1,
                where=filter_isnot,
                collection="mala_persons_info",
            )
            distance = query_results["distances"][0][0]
            print_yellow(query_results["metadatas"][0][0]["name"], distance)
            if distance > 1:
                return None

        print_blue("Name found peson:", query_results["documents"][0][0])
        found_person = FoundPerson(
            db,
            name=query_results["metadatas"][0][0]["name"],
            key=query_results["metadatas"][0][0]["_key"],
        )

        return found_person

    def identify(self):
        """Build per-interrogation (answer, interrogation_doc) suggestions.

        When a candidate person is found, asks the LLM for each interrogation
        whether the two are the same; otherwise emits answer-less suggestions
        so a human can review each mention. Sets self.suggestions.
        """
        llm = LLM(small=True)

        # Fix: find_person takes no argument (it reads self.unknown_person);
        # the original passed self.unknown_person, raising TypeError.
        self.found_person = self.find_person()

        if not self.found_person:
            self.suggestions = [
                (None, i) for i in self.unknown_person.doc["mentioned_in_interrogation"]
            ]
            # Fix: stop here — the original fell through and crashed on
            # self.found_person.make_summary() with found_person=None.
            return

        # Summarize the found persons info
        self.found_person.make_summary()

        suggestions = []
        for interrogation_id in self.unknown_person.doc["mentioned_in_interrogation"]:
            interrogation_data = db.collection("interrogations").get(interrogation_id)
            text = interrogation_data["text"]

            answer_prompt = f'''I texten nedan omnämns en "{self.unknown_person.name}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n
TEXT:
"""{text}"""\n

På andra ställen i polisens förundersökning finns en person som heter "{self.found_person.name}", och som beskrivs så här:
"""{self.found_person.summary}"""\n
Verkar det troligt att personen som kallas {self.unknown_person.name} är samma person som {self.found_person.name}? Svara bara JA eller NEJ, samt en kort förklaring till varför.
'''
            answer = llm.generate(answer_prompt)
            suggestions.append((answer, interrogation_data))

        self.suggestions = suggestions
||||
|
||||
def verify(
    db,
    answer=None,
    unknown_person=None,
    found_person=None,
    interrogation_key=None,
):
    """
    Apply a verdict about whether *unknown_person* is *found_person*.

    Args:
        db: The ArangoDB database handle.
        answer (str): "Yes", "No", or "Unknown".
        unknown_person: The unconfirmed person (object with a ``.doc`` dict).
        found_person: The matched person (object with a ``.doc`` dict);
            required for "Yes" and "No".
        interrogation_key (str): Key of the interrogation the mention is from.

    Returns:
        None
    """
    print_blue("Answer:", answer)
    # If the answer is Yes: move the mention over to the confirmed person.
    if answer == "Yes":
        # Fix: guard the remove — the key may already have been taken out,
        # and list.remove raises ValueError on a missing element.
        if interrogation_key in unknown_person.doc["mentioned_in_interrogation"]:
            unknown_person.doc["mentioned_in_interrogation"].remove(interrogation_key)
        db.collection("persons").update(unknown_person.doc)

        found_person.doc["confirmed"] = True
        # Fix: merge the *unknown* person's info — the original concatenated
        # found_person's info onto itself, duplicating it.
        found_person.doc["info"] += unknown_person.doc.get("info", [])
        # Fix: record the actual interrogation key — the original appended
        # the literal string "mentioned_in_interrogation".
        if (
            interrogation_key
            and interrogation_key not in found_person.doc["mentioned_in_interrogation"]
        ):
            found_person.doc["mentioned_in_interrogation"].append(interrogation_key)

        print("Updated person in arango:")
        print_green(
            db.collection("persons").insert(
                found_person.doc, overwrite_mode="update"
            )
        )
        # When every mention is accounted for, archive the duplicate record
        # and delete it from "persons".
        if (
            unknown_person.doc["mentioned_in_interrogation"] == []
            and unknown_person.doc["_key"] != found_person.doc["_key"]
        ):
            db.collection("other_persons").insert(
                unknown_person.doc, overwrite_mode="update"
            )
            db.collection("persons").delete(unknown_person.doc, check_rev=False)
            print_red(f"Removed {unknown_person.doc}")

    # If the answer is No: remember the non-match so it is not suggested again.
    if answer == "No":
        if "is_not" not in unknown_person.doc:
            unknown_person.doc["is_not"] = []

        # Fix: append the name itself, not a one-element list — the "$nin"
        # filter built in PersonIdentifier.find_person expects a flat list.
        unknown_person.doc["is_not"].append(found_person.doc["name"])
        db.collection("persons").update(
            unknown_person.doc, merge=True, check_rev=False
        )

    # If the answer is Unknown: park the mention for later review.
    if answer == "Unknown":
        db.collection("unknown").insert(
            {"name": unknown_person.name, "interrogation": interrogation_key},
            overwrite=True,
        )
||||
|
||||
|
||||
class PersonFinder:
    """Find person names mentioned in police-interrogation text with an LLM.

    The interrogation text is split into chunks and a (Swedish) prompt asks
    the LLM to list every person named in each chunk.
    """

    def __init__(
        self,
        names=None,
        chunk_size=5000,
        chunk_overlap=0,
        separator="\n\n",
    ):
        """
        Args:
            names (dict | None): Already-known names mapped to their records.
                Defaults to a fresh empty dict per instance.
            chunk_size (int): Maximum characters per chunk.
            chunk_overlap (int): Character overlap between adjacent chunks.
            separator (str): Separator used when splitting the text.
        """
        # Bug fix: original used a mutable default argument (`names={}`),
        # which is shared across every instance of the class.
        self.names = {} if names is None else names
        self.llm = LLM(
            chat=False,
            small=True,
            system_prompt="Du är en assistent som hjälper till att hitta personer i ett polisförhör. Svara bara när personen finns i den del du får, hitta inte på personer.",
        )
        # Bug fix: the `separator` parameter was accepted but ignored —
        # a hard-coded "\n\n" was passed to the splitter instead.
        self.text_splitter = CharacterTextSplitter(
            separator=separator,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )

    def extract_names(self, chunk, extra_prompt=""):
        """Return the new person names the LLM finds in *chunk*.

        Names already present in ``self.names`` are skipped; a name that is a
        substring of a known name (e.g. "Anna" vs "Anna Svensson") is treated
        as the same person and registered in ``self.names``.

        Args:
            chunk (str): A piece of interrogation text.
            extra_prompt (str): Optional extra instructions appended to the prompt.

        Returns:
            list[str]: Newly discovered names, in order of appearance.
        """
        chunk_names = []
        # Find persons in the text.
        # Bug fix: the closing example tag was "</exempel" (missing ">"),
        # which could confuse the model about where the example ends.
        prompt = f'''Jag vill hitta alla personer som nämns i texten nedan:\n
"""{chunk}"""\n
Vilka personer nämns i texten? Svara ENBART med en pythonformaterad lista av namn.
Exempel på svar för att du ska förstå formen:
<exempel>
[namn1, namn2, namn3].
</exempel>
Var noga med att svara
{extra_prompt}'''
        response = self.llm.generate(prompt)
        # Keep only letters (incl. Swedish å/ä/ö), hyphens, spaces and commas,
        # then drop a stray " namn " the model sometimes emits.
        response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")

        for name in [i.strip() for i in response.split(",") if len(i) > 2]:
            same_name = False
            if name not in chunk_names and name not in self.names:
                # Idiom fix: original tested `self.names != []`, comparing a
                # dict to a list — always True. Truthiness is what was meant.
                if self.names:
                    # list() snapshots keys so mutating self.names is safe.
                    for n in list(self.names):
                        if name in n:
                            same_name = True
                            self.names[name] = self.names[n]

                if not same_name:
                    chunk_names.append(name)

        return chunk_names
||||
|
||||
|
||||
# Manual smoke test: run this module directly to identify one hard-coded person.
if __name__ == "__main__":

    # NOTE(review): `text` is fetched but never used below — confirm whether it
    # was meant to be passed to the identifier, or remove the fetch.
    # `db` and `PersonIdentifier` are presumably defined earlier in this file.
    text = db.collection('rumors').get('Mikael_Sjostrom_2023-02-13_p.98')
    person = PersonIdentifier(
        doc={'name': 'Douglas', 'interrogation_key': "_'Larsson',_'_Neo'__2023-02-15_p.208"})
    person.find_with_llm()
||||
@ -1,11 +0,0 @@ |
||||
from _arango import arango
from _llm import LLM

# LLM client kept alive between calls (not used further in this script).
llm = LLM(keep_alive=6000, chat=False)

# Fetch every interrogation document that has a recorded reason.
query = 'for doc in interrogations filter doc.reason != null return doc'
results = list(arango.db.aql.execute(query))

for record in results:
    # Person name in green, followed by the stated reason.
    print("\033[92m", record['person'], "\033[0m", record['reason'])
||||
|
||||
@ -1,23 +1,58 @@ |
||||
from random import choice |
||||
|
||||
|
||||
def print_green(*args):
    """Print all arguments on one line in green terminal text.

    Bug fix: diff residue had left both the old (`' '`) and new (`" "`)
    append lines in the body, appending every argument twice.
    """
    text = ""
    for arg in args:
        text += str(arg) + " "
    print(f"\033[92m{text}\033[0m")
||||
|
||||
|
||||
def print_red(*args):
    """Print all arguments on one line in red terminal text.

    Bug fix: diff residue had left both the old (`' '`) and new (`" "`)
    append lines in the body, appending every argument twice.
    """
    text = ""
    for arg in args:
        text += str(arg) + " "
    print(f"\033[91m{text}\033[0m")
||||
|
||||
|
||||
def print_yellow(*args):
    """Print all arguments on one line in yellow terminal text.

    Bug fix: diff residue had left both the old (`' '`) and new (`" "`)
    append lines in the body, appending every argument twice.
    """
    text = ""
    for arg in args:
        text += str(arg) + " "
    print(f"\033[93m{text}\033[0m")
||||
|
||||
|
||||
def print_blue(*args):
    """Print all arguments on one line in blue terminal text.

    Bug fix: removed a dead duplicate `text = ''` assignment left over
    from diff residue.
    """
    text = ""
    for arg in args:
        text += str(arg) + " "
    print(f"\033[94m{text}\033[0m")
||||
|
||||
|
||||
def print_purple(*args):
    """Print all arguments on one line in purple terminal text."""
    # Build "arg1 arg2 ... " (note the trailing space, as in the siblings).
    joined = "".join(str(item) + " " for item in args)
    print(f"\033[95m{joined}\033[0m")
||||
|
||||
|
||||
def choose_color(last_color_index):
    """Advance to the next ANSI color in the fixed cycle.

    Args:
        last_color_index: Index of the previously used color (-1 to start).

    Returns:
        tuple: (ansi_escape_code, color_name, new_color_index).
    """
    palette = [
        ("blue", "\033[94m"),
        ("green", "\033[92m"),
        ("yellow", "\033[93m"),
        ("red", "\033[91m"),
        ("purple", "\033[95m"),
    ]
    # Wrap around so the cycle repeats indefinitely.
    next_index = (last_color_index + 1) % len(palette)
    name, code = palette[next_index]
    return code, name, next_index
||||
|
||||
def print_rainbow(*args):
    """Print the arguments on one line, each in the next color of the cycle.

    Bug fix: diff residue had left the old implementation's lines
    (a plain-space append and a blue `print`) inside the loop, which
    printed the partial text once per argument and duplicated content.
    """
    color_index = -1
    text = ""
    for arg in args:
        # Advance to a new color for every argument.
        color_code, color, color_index = choose_color(color_index)
        text += f"{color_code}{arg}\033[0m "
    print(text)
||||
@ -0,0 +1,15 @@ |
||||
from _llm import LLM
from _arango import arango
from print_color import *


# Ask the LLM what each interrogation says about Saturday evening.
llm = LLM(chat=False)
interrogations = list(arango.db.collection("interrogations").all())

for interrogation in interrogations:
    text = interrogation['text']
    # Typo fix in the prompt sent to the LLM: "prsonre" -> "personer".
    prompt = f'Vad sägs om lördagskvällen i texten nedan? \n\n"""{text}""" Jag vill veta vad som sägs i texten om lördagskvällen. Var noga med personer, namn och platser.'

    answer = llm.generate(prompt)
    # Person name in blue, then the model's summary.
    print_blue(interrogation['person'])
    print(answer, '\n')
||||
@ -0,0 +1,94 @@ |
||||
from _chroma import ChromaDB |
||||
from _openai import LLM_OpenAI as LLM |
||||
import streamlit as st |
||||
from print_color import * |
||||
|
||||
def get_docs(user_input):
    """Return the 5 chunks from 'mala_interrogations' most similar to *user_input*.

    Uses the module-level `chroma` client; result is Chroma's raw query dict.
    """
    return chroma.query('mala_interrogations', user_input, n_results=5)
||||
|
||||
def generate_prompt(user_input, docs):
    """Build the RAG prompt for the LLM from retrieved interrogation chunks.

    Args:
        user_input (str): The user's question.
        docs (dict): Chroma query result; uses docs['documents'][0] (texts)
            and docs['metadatas'][0] (dicts with 'person' and 'date').

    Returns:
        str: The complete prompt containing the question and every chunk,
        each prefixed with an upper-cased source header.
    """
    texts = [text for text in docs['documents'][0]]
    metas = [{'person': meta['person'], 'date': meta['date']} for meta in docs['metadatas'][0]]
    combined_data = list(zip(texts, metas))
    string = ''
    for text, meta in combined_data:
        # Upper-cased source header: "FRÅN FÖRHÖR MED <person> <date>:"
        string += f'\n\nFrån förhör med {meta["person"]} {meta["date"]}:'.upper()
        string += f'\n{text}\n\n'

    prompt = f'''Svara på frågan: {user_input}\n
Använd endast informationen nedan:\n
{string}\n
Skriv utförligt på svenska och var noga med detaljer som namn, plats och datum.
Får gärna med information från alla fem förhör om det är relevant.\n
{user_input}'''
    # Bug fix: the original called st.session_state.llm.generate(prompt) here
    # and discarded the result — the caller generates from the returned prompt
    # itself, so that was a duplicate, state-polluting LLM call.
    return prompt
||||
|
||||
st.set_page_config(
    page_title="Malå",
)

# The chat LLM lives in session state so it is NOT reset on every rerun.
if "llm" not in st.session_state:
    st.session_state.llm = LLM(chat=True, system_prompt='Du är assistent åt en journalist som går igenom förhör i en förundersökning. Svara bara utifrån den information du får. Svara alltid på svenska!')

# These are deliberately recreated on every rerun.
llm_checker = LLM(chat=True)
chroma = ChromaDB()


# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Accept user input
if user_input := st.chat_input("Fråga något om förhören."):
    print_blue(user_input)
    # Bug fix: default to the raw question so `question2chroma` is always
    # bound — the original left it undefined (NameError at get_docs) when the
    # classifier answer contained none of "uppföljning"/"fristående"/"None".
    question2chroma = user_input
    if len(st.session_state.messages) > 1:
        history = ''
        for message in st.session_state.messages:
            # NOTE(review): the trailing apostrophe after \n looks like a typo
            # in the original; kept byte-identical to preserve prompt behavior.
            history += f"{message['role']}: {message['content']}\n'"
        prompt = f'En användare har ställt frågan "{user_input}" och här är chatthistoriken mellan användaren och en assistent:\n{history}\n\nVerkar "{user_input}" vara en uppföljningfråga eller en fristående fråga? Svara ENDAST med "uppföljning" eller "fristående".'
        chat_completion = llm_checker.generate(prompt, stream=False)
        answer = chat_completion.choices[0].message.content

        print_red(answer)
        if 'uppföljning' in answer:
            # Rewrite the follow-up into a standalone question for retrieval.
            prompt = f'Använd historiken till att omformulera "{user_input}" till en helt fristående fråga. Frågan ska användas för att hitta information i förhören.'
            chat_completion = llm_checker.generate(prompt, stream=False)
            question2chroma = chat_completion.choices[0].message.content
        if 'fristående' in answer:
            question2chroma = user_input

        if 'None' in answer:
            question2chroma = user_input

        print_yellow(question2chroma)

    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": user_input})
    # Display user message in chat message container
    with st.chat_message("user"):
        st.markdown(user_input)

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        docs = get_docs(question2chroma)
        prompt = generate_prompt(user_input, docs)
        stream = st.session_state.llm.generate(prompt)
        response = st.write_stream(stream)
        st.session_state.llm.messages.append({'role': 'assistant', 'content': response})
    st.session_state.messages.append({"role": "assistant", "content": response})
    print()
||||
|
||||
|
||||
Loading…
Reference in new issue