parent
b5ad23f652
commit
744b1f02f2
21 changed files with 1971 additions and 381 deletions
@ -0,0 +1,335 @@ |
|||||||
|
import streamlit as st |
||||||
|
from identify_person import identify, verify, find_person, UnverifiedPerson, FoundPerson |
||||||
|
from _arango import arango |
||||||
|
import re |
||||||
|
from fuzzywuzzy import process |
||||||
|
from _llm import LLM as LLM_garda |
||||||
|
from _openai import LLM_OpenAI as LLM |
||||||
|
from print_color import * |
||||||
|
from random import randint |
||||||
|
|
||||||
|
|
||||||
|
# from print_color import * |
||||||
|
print("Start") |
||||||
|
|
||||||
|
|
||||||
|
def reset_choices():
    """Clear every user selection stored in session state."""
    for choice_key in ("user_choice", "unconfirmed_choice", "custom_choice"):
        st.session_state[choice_key] = None
||||||
|
|
||||||
|
|
||||||
|
def check_if_dict_in_list(target_dict, list_of_dicts):
    """Return True if any dict in *list_of_dicts* shares its first
    key/value pair with *target_dict*.

    Only the first item of each dict is compared (the dicts used here are
    single-entry ``{mentioned_name: interrogation_key}`` records).
    """
    target_item = list(target_dict.items())[0]
    return any(list(candidate.items())[0] == target_item for candidate in list_of_dicts)
||||||
|
|
||||||
|
|
||||||
|
def submitted():
    # Callback for the form's submit button: flag that the user pressed "Nästa"
    # so the handler at the bottom of the script runs on this rerun.
    st.session_state.next = True
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data()
def sort_names_by_similarity(target_name, name_list):
    """Return *name_list* ordered by fuzzy similarity to *target_name* (best first)."""
    # fuzzywuzzy yields (name, score) tuples for every candidate.
    scored = process.extract(target_name, name_list, limit=len(name_list))
    # Rank by score, highest similarity first.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    # Drop the scores, keep just the ordered names.
    return [candidate for candidate, _score in ranked]
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data()
def get_persons():
    # Load every person document from the ArangoDB "persons" collection.
    return list(arango.db.collection("persons").all())
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data()
def get_unverified_persons():
    """Return all person documents that are not yet confirmed.

    Documents where ``confirmed`` is missing or anything other than ``true``
    are included. Uses the module-level ``db`` handle (``arango.db``).
    """
    q = "for doc in persons filter doc.confirmed != true return doc"
    # list() instead of a pass-through comprehension over the cursor.
    return list(db.aql.execute(q))
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data()
def get_suggestions(person):
    # Run the identification pipeline for *person* (a plain dict) and stash
    # the results in session state for the rest of the app to read.
    suggestion = identify(person)
    st.session_state.unverified_person = suggestion["unverified_person"]
    st.session_state.found_person = suggestion["found_person"]
    st.session_state.suggestions = suggestion["suggestions"]
||||||
|
|
||||||
|
|
||||||
|
def caps(string):
    """Format a person's name for display in the selectbox.

    Names without a ``*`` marker belong to confirmed persons and are shown
    in upper case; names carrying ``*`` (unconfirmed) keep their casing but
    lose the marker.
    """
    return string.replace("*", "") if "*" in string else string.upper()
||||||
|
|
||||||
|
|
||||||
|
def get_unverified_person():
    # Pick a random unverified person, remove it from the pool and store it
    # in session state. When the pool is empty, randint(0, -1) raises
    # ValueError, which is used as the "all done" signal.
    try:
        n = randint(0, len(st.session_state.unverified_persons) - 1)
        doc = st.session_state.unverified_persons.pop(n)
        st.session_state.unverified_person = UnverifiedPerson(doc)
    except ValueError:
        st.markdown(":green[Inga fler personer att verifiera.]")
        st.stop()
||||||
|
|
||||||
|
|
||||||
|
# --- Page setup and initial session state --------------------------------
st.set_page_config(
    page_title="Malå",
)

# Get URL parameters; a "person_key" parameter restricts the app to a
# single person (deep-link mode).
params = st.query_params
param_person_key = params.get("person_key", None)

db = arango.db

# Add a session state to store the persons and unconfirmed persons etc
if "next" not in st.session_state:
    st.session_state.next = False
if "persons" not in st.session_state:
    st.session_state.persons = get_persons()

# Build the selectbox name list; unconfirmed persons are marked with "*"
# (caps() later uses the marker to distinguish them).
all_persons_name_list = []
for person in st.session_state.persons:
    name = person["name"]
    if not person["confirmed"]:
        name += "*"
    all_persons_name_list.append(name)
st.session_state.persons_names = all_persons_name_list
# Map display name -> ArangoDB _key for lookups after selection.
st.session_state.persons_dict = {
    i["name"]: i["_key"] for i in st.session_state.persons
}
||||||
|
|
||||||
|
if "unverified_persons" not in st.session_state: |
||||||
|
if param_person_key: |
||||||
|
# If a person key is provided in the URL, only show that person |
||||||
|
st.session_state.unverified_persons = list( |
||||||
|
db.aql.execute( |
||||||
|
"for doc in persons filter doc._key == @key return doc", |
||||||
|
bind_vars={"key": param_person_key}, |
||||||
|
) |
||||||
|
) |
||||||
|
print_blue("param_person_key".upper(), st.session_state.unverified_persons) |
||||||
|
else: |
||||||
|
st.session_state.unverified_persons = get_unverified_persons() |
||||||
|
|
||||||
|
if "persons_names" not in st.session_state: |
||||||
|
st.session_state.persons_names = arango.get_persons(confirmed=False)["names"] |
||||||
|
|
||||||
|
if "user_choice" not in st.session_state: |
||||||
|
st.session_state.user_choice = None |
||||||
|
if "unconfirmed_choice" not in st.session_state: |
||||||
|
st.session_state.unconfirmed_choice = None |
||||||
|
if "custom_choice" not in st.session_state: |
||||||
|
st.session_state.custom_choice = None |
||||||
|
|
||||||
|
if ( |
||||||
|
"unverified_person" not in st.session_state |
||||||
|
or not st.session_state.unverified_persons |
||||||
|
): |
||||||
|
get_unverified_person() |
||||||
|
|
||||||
|
if "found_person" not in st.session_state: |
||||||
|
st.session_state.found_person = None |
||||||
|
if "suggestions" not in st.session_state: |
||||||
|
# Set new values for unverified_person, found_person and suggestions (as session_state) |
||||||
|
get_suggestions(st.session_state.unverified_person.__dict__) |
||||||
|
print_yellow("SUGGESTIONS", st.session_state.suggestions) |
||||||
|
|
||||||
|
if "suggestion" not in st.session_state: |
||||||
|
st.session_state.suggestion = st.session_state.suggestions.pop(0) |
||||||
|
|
||||||
|
|
||||||
|
# Get unconfirmed person, found person and answer from the suggestions |
||||||
|
# Get unconfirmed person, found person and answer from the suggestions
unverified_person: UnverifiedPerson = st.session_state.unverified_person
found_person: FoundPerson = st.session_state.found_person

# A suggestion is an (llm_answer, interrogation) pair; the interrogation may
# be stored either as a key (str) or as a full document.
answer = st.session_state.suggestion[0]
interrogation_doc = st.session_state.suggestion[1]
if isinstance(interrogation_doc, str):
    interrogation_doc = db.collection("interrogations").get(interrogation_doc)
text = interrogation_doc["text"]


st.markdown(
    f'Namnet **"{st.session_state.unverified_person.name}"** används i **{len(st.session_state.unverified_person.mentioned_in_interrogation)}** förhör. Namnet kan syfta på olika personer i olika sammanhang så vi går igenom förhören ett och ett.'
)

if answer:
    answer = answer.replace("\n", " ")
    st.markdown(
        f"Är :blue[{unverified_person.name}] samma som :blue[{found_person.name}]?"
    )
    print(found_person.__dict__)
    st.write(f'(från förhör med {interrogation_doc["person"]})')
    # The LLM answer starts with JA/NEJ; use it to pre-select the radio button
    # (0 = "Ja", 1 = "Nej", None = no pre-selection).
    if "JA" in answer:
        st.markdown(f"🤖\n:green[{answer.replace('JA ', '')}]")
        radio_index = 0
    elif "NEJ" in answer:
        radio_index = 1
        st.markdown(f"🤖\n:red[{answer.replace('NEJ ', '')}]")
    else:
        radio_index = None
        st.markdown(f"🤖\n{answer}")

    # Let the user expand for more info

else:
    st.markdown(f"Vem är :blue[{unverified_person.name}]?")
    st.write(f'(från förhör med {interrogation_doc["person"]})')
    # Show the information about the suggested person


# Edit and show the interrogation text
with st.expander(f"Mer information om förhöret"):
    # Bold the speaker labels, collapse single newlines into spaces, turn the
    # remaining newlines into <br>, and highlight the person's name in red.
    text = text.replace("\nFL:", "<br>**FL:** ").replace("\nDH:", "<br>**DH:** ")
    text = re.sub(r"\n(?!\n)", "", text)
    text = re.sub(r"\n\n+", "\n", text)
    text = text.replace("\n", "<br>").replace(
        unverified_person.name, f"**:red[{unverified_person.name}]**"
    )
    st.markdown(f"##### Förhöret:\n{text}", unsafe_allow_html=True)
||||||
|
|
||||||
|
# A form to let the user select an alternative |
||||||
|
with st.form("select_alternative"): |
||||||
|
|
||||||
|
if answer: |
||||||
|
# Let the user decide if the suggested person is the same as the unconfirmed person |
||||||
|
st.session_state.user_choice = st.radio( |
||||||
|
"Select alternative", |
||||||
|
("Ja", "Nej", "Vet ej"), |
||||||
|
key="user_choice_radio", |
||||||
|
index=radio_index, |
||||||
|
) |
||||||
|
|
||||||
|
else: |
||||||
|
st.session_state.user_choice = None |
||||||
|
|
||||||
|
# Let the user select an alternative person |
||||||
|
alternatives = sort_names_by_similarity( |
||||||
|
unverified_person.name, st.session_state.persons_names |
||||||
|
) |
||||||
|
|
||||||
|
st.session_state.unconfirmed_choice = st.selectbox( |
||||||
|
"Välj någon som stämmer", |
||||||
|
alternatives, |
||||||
|
placeholder="Sök en annan", |
||||||
|
index=None, |
||||||
|
key="multiselect", |
||||||
|
format_func=caps, |
||||||
|
help="Personer i caps är bekräftade personer, välj någon av dem om det verkar stämma.", |
||||||
|
) |
||||||
|
|
||||||
|
# Let the user enter a custom alternative |
||||||
|
st.session_state.custom_choice = st.text_input( |
||||||
|
"Annan person", |
||||||
|
key="custom", |
||||||
|
help="Skriv in namnet på personen om det inte finns i listan. Var noga med stavningen.", |
||||||
|
) |
||||||
|
st.session_state.custom_choice = ( |
||||||
|
None if st.session_state.custom_choice == "" else st.session_state.custom_choice |
||||||
|
) |
||||||
|
|
||||||
|
# If the user has made a selection |
||||||
|
st.session_state.next = st.form_submit_button("Nästa", on_click=submitted) |
||||||
|
|
||||||
|
# --- Handle the form submission ------------------------------------------
# Exactly one of the three inputs wins, in priority order:
# custom text > selectbox pick > yes/no/unknown radio.
if st.session_state.next:
    if st.session_state.custom_choice:
        # The user typed a brand-new name: summarize what this interrogation
        # says about the person and insert a new person document.
        print("CUSTOM CHOICE", st.session_state.custom_choice)
        llm = LLM()
        info = llm.generate(
            f'Nedan är ett polisförhör där en person omnämns som "{unverified_person.name}".\n\n{interrogation_doc["text"]}\n\nSammanfatta informationen om {unverified_person.name} på ett detaljerat sätt, var noga med namn, platser, händelser och relationer. Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat. '
        )
        person_in_arango = db.collection("persons").insert(
            {
                "_key": arango.fix_key_name(st.session_state.custom_choice),
                "name": st.session_state.custom_choice,
                "info": [info],
                "mentioned_in_interrogation": [interrogation_doc["_key"]],
                "mentioned_as": [{unverified_person.name: interrogation_doc["_key"]}],
            }
        )
        verify(
            db,
            "Yes",
            unverified_person.doc,
            found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )

    elif st.session_state.unconfirmed_choice:
        # The user picked an existing person from the selectbox; strip the
        # "unconfirmed" marker before looking the person up.
        unconfirmed_choice = st.session_state.unconfirmed_choice.replace(
            "*", ""
        ).strip()

        print_yellow("OTHER CHOICE", unconfirmed_choice)
        doc = db.collection("persons").get(
            st.session_state.persons_dict[unconfirmed_choice]
        )
        found_person = FoundPerson(
            db, unconfirmed_choice, st.session_state.persons_dict[unconfirmed_choice]
        )
        print("NEW:", found_person.name)

        verify(db, "Yes", unverified_person, found_person, interrogation_doc["_key"])

    elif st.session_state.user_choice == "Ja":
        print("USER CHOICE", st.session_state.user_choice)
        # Record under which name this person was mentioned in this
        # interrogation, avoiding duplicate entries.
        if "mentioned_as" not in found_person.doc:
            found_person.doc["mentioned_as"] = []
        # BUGFIX: previously used the stale loop variable `person` (the last
        # entry of st.session_state.persons) instead of the person actually
        # being verified.
        if not check_if_dict_in_list(
            {unverified_person.name: interrogation_doc["_key"]},
            found_person.doc["mentioned_as"],
        ):
            found_person.doc["mentioned_as"].append(
                {unverified_person.name: interrogation_doc["_key"]}
            )
        # BUGFIX: verify() has no `person`/`person_in_arango` parameters
        # (its signature is unknown_person/found_person); the old keyword
        # names raised TypeError at runtime.
        verify(
            db,
            answer="Yes",
            unknown_person=unverified_person.doc,
            found_person=found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )
    elif st.session_state.user_choice == "Nej":
        # BUGFIX: keyword was `unverified_person=`, which verify() does not
        # accept; the parameter is named `unknown_person`.
        verify(
            db,
            "No",
            unknown_person=unverified_person.doc,
            found_person=found_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )

    elif st.session_state.user_choice == "Vet ej":
        verify(
            db,
            "Unknown",
            unknown_person=unverified_person.doc,
            interrogation_key=interrogation_doc["_key"],
        )

    reset_choices()
    if not param_person_key:
        # Move on to the next suggestion, or fetch a new person once the
        # current one has no suggestions left, then rerun the script.
        if st.session_state.suggestions != []:
            st.session_state.suggestion = st.session_state.suggestions.pop(0)
        else:
            get_unverified_person()
            get_suggestions(st.session_state.unverified_person.__dict__)
            st.session_state.suggestion = st.session_state.suggestions.pop(0)
        st.rerun()
    else:
        # Deep-link mode: a single person was verified, nothing more to do.
        # (Typo fixed in the user-facing message.)
        st.markdown(":green[Tack!] Du kan stänga den här fliken nu.")
        st.stop()
||||||
@ -0,0 +1,76 @@ |
|||||||
|
from openai import OpenAI, RateLimitError |
||||||
|
from dotenv import load_dotenv |
||||||
|
import os |
||||||
|
from _llm import LLM as LLM_ollama |
||||||
|
from print_color import * |
||||||
|
from time import sleep |
||||||
|
load_dotenv() |
||||||
|
|
||||||
|
class LLM_OpenAI:
    """Thin wrapper around the OpenAI chat API with a local Ollama fallback.

    Optionally keeps a running chat history that is trimmed to fit
    ``max_tokens``. NOTE(review): "tokens" are measured as *characters*
    (``len`` of the content strings), not real model tokens.
    """

    def __init__(
        self,
        system_prompt='Svara alltid på svenska. Svara bara på det som efterfrågas. Om du inte kan svara, skriv "Jag vet inte".',
        chat=False,
        model="gpt-3.5-turbo-0125",
        max_tokens=24000,
        sleep_time=0
    ):
        # chat=True keeps a running message history across generate() calls.
        self.chat = chat
        self.model = model
        self.temperature=0
        # Character budget for the message history (see class note).
        self.max_tokens = max_tokens
        self.system_message = {"role": "system", "content": system_prompt}
        self.messages =[self.system_message]
        self.client = OpenAI(
            # This is the default and can be omitted
            api_key=os.getenv("OPEN_AI"),
        )
        self.llm_ollama = LLM_ollama(chat=False, stream=True)  # For backup
        # Seconds to wait before each generate() call (simple rate limiting).
        self.sleep_time = sleep_time

    def build_message(self, message):
        """Append *message* to the chat history and trim it to the budget."""
        # Add the new message to the list
        self.messages.append({"role": "user", "content": message})

        # Calculate the total token length of the messages
        total_tokens = sum([len((msg["content"])) for msg in self.messages])

        # While the total token length exceeds the limit, remove the oldest messages
        while total_tokens > self.max_tokens:
            removed_message = self.messages.pop(
                1
            )  # Remove the oldest message (not the system message)
            total_tokens -= len((removed_message["content"]))

    def generate(self, prompt, stream=False, local=False):
        """Generate a completion for *prompt*.

        Uses the local Ollama model when ``local=True`` or when OpenAI rate
        limits. Returns the raw response object when ``stream=True``,
        otherwise the answer string (which is also appended to the history
        in chat mode).
        """
        sleep(self.sleep_time)
        if self.chat:
            self.build_message(prompt)
            messages = self.messages
        else:
            # One-shot mode: system prompt plus this single user message.
            messages = [self.system_message, {"role": "user", "content": prompt}]
        # Debug: total character length of what is being sent.
        print(sum([len((msg["content"])) for msg in messages]))

        if local:
            response = self.llm_ollama.generate_stream(prompt)

        else:
            try:
                response = self.client.chat.completions.create(
                    messages=messages,
                    model=self.model,
                    stream=stream
                )
            except RateLimitError as e:
                # Fall back to the local model when rate limited.
                print_red(e)
                response = self.llm_ollama.generate_stream(prompt)

        if stream:
            return response
        else:
            answer = response.choices[0].message.content
            if self.chat:
                self.messages.append({"role": "assistant", "content": answer})
            return answer
||||||
|
|
||||||
|
|
||||||
@ -0,0 +1,43 @@ |
|||||||
|
from _arango import db |
||||||
|
from _llm import LLM |
||||||
|
from langchain_text_splitters import CharacterTextSplitter |
||||||
|
from print_color import * |
||||||
|
# One-off script: format raw interrogation texts with markdown via the LLM
# and store the result in each document's `formatted_text` field.
interrogations = list(db.aql.execute('for doc in interrogations filter doc.formatted_text == null return doc', count=True))


# Split long interrogations into ~2000-character chunks on blank lines so
# each LLM call stays small.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=2000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)


for interrogation in interrogations:
    text = interrogation['text']

    chunks = text_splitter.split_text(text)
    formated_chunks = []
    for chunk in chunks:
        print_yellow(len(chunk))

        llm = LLM(chat=False, system_prompt='Du formaterar text enligt med markdown för att göra den lättare att läsa. Använd inte rubriker, bara fet och stil. Om det förekommer en dialog fetmarkera den som talar, exempelvis ** DH: **. Namn ska göras fetade, även om det bara är ett förnamn. Svara alltid med EXAKT samma text som du fick, men formaterad. Svara alltid på svenska.')
        # Backfill the 'name' field on older interrogation documents.
        if 'name' not in interrogation:
            interrogation['name'] = interrogation['person']
        name = interrogation['name']

        prompt = f'''Kolla på texten nedan: \n\n\n{chunk}\n\n\nJag vill att du svarar med EXAKT samma text, men formaterad enligt markdown för att vara enklare att läsa. Formatera enligt följande:
- Använd aldrig rubriker (#)
- Om det är en längre dialog mellan förhörsledare (FL) och den hörde (DH) så formatera dem med fetstil, exempelvis **DH: **.
- Gör namn på personer fetade, även om det bara är ett förnamn. Den förhörde – {name} – ska inte vara fetad utan normal text.
Ibland är styckeindelningen inte korrekt, försök att göra det lättare att läsa.
Svara ENBART med den formaterade texten, ingenting annat.'''
        formatted_chunk = llm.generate(prompt)
        print_blue(formatted_chunk)
        formated_chunks.append(formatted_chunk)
    # Re-join the formatted chunks and persist on the document.
    formatted_text = '\n '.join(formated_chunks)
    interrogation['formatted_text'] = formatted_text
    db.collection('interrogations').update(interrogation, check_rev=False)
||||||
|
|
||||||
|
|
||||||
@ -0,0 +1,26 @@ |
|||||||
|
from _arango import db |
||||||
|
from _llm import LLM |
||||||
|
from print_color import * |
||||||
|
# One-off migration: replace person *names* in relation endpoints ('to'/'from')
# with the stable person_id taken from the linked interrogation document.
relations = list(db.aql.execute('for doc in relations return doc', count=True))

for relation in relations:
    interrogation = db.collection('interrogations').get(relation['interrogation'])
    if not interrogation:
        # Dangling reference: log the relation and skip it.
        print_red(relation)
        continue
    for i in ['to', 'from']:
        # Backfill the 'name' field on older interrogation documents.
        if 'name' not in interrogation:
            interrogation['name'] = interrogation['person']
            db.collection('interrogations').update(interrogation, check_rev=False)
        if relation[i] == interrogation['name']:
            relation[i] = interrogation['person_id']
        elif relation[i] == interrogation['person_mentioned_as']:
            relation[i] = interrogation['person_id']
    # Debug output: the updated relation and its interrogation (minus texts).
    for k, v in relation.items():
        print_rainbow(k, v)
    print()
    for k, v in interrogation.items():
        if k == 'text' or k == 'formatted_text':
            continue
        print_rainbow(k, v)
    db.collection('relations').update(relation, check_rev=False)
||||||
@ -0,0 +1,34 @@ |
|||||||
|
from _llm import LLM |
||||||
|
from _arango import db |
||||||
|
from _chroma import chroma |
||||||
|
from print_color import * |
||||||
|
from identify_person import find_person |
||||||
|
|
||||||
|
|
||||||
|
llm = LLM(small=True) |
||||||
|
|
||||||
|
|
||||||
|
def check_from(relations):
    """For each relation, ask the LLM whether relation['from'] is the person
    being interrogated; if so, store that person's id as `from_key`.

    Uses the module-level `llm` instance and updates ArangoDB in place.
    """
    for relation in relations:
        interrogation = db.collection('interrogations').get(relation['interrogation'])
        text = f"Hörd person: {interrogation['person']}\n{interrogation['text']}"
        # Only the first 2000 characters are sent to keep the prompt small.
        prompt = f"""Är "{relation['from']}" personen som förhörs i texten nedan?\n\n{text[:2000]}\n\nSvara enbart JA eller NEJ."""
        answer = llm.generate(prompt)
        if 'JA' in answer:
            relation['from_key'] = interrogation['person_id']
            db.collection('relations').update(relation, check_rev=False)
        print_rainbow(relation['from'], interrogation['person'], answer)
||||||
|
|
||||||
|
|
||||||
|
q = "for doc in relations filter doc.from_key == null limit 10 return doc" #! Limit 10 |
||||||
|
relations = list(db.aql.execute(q)) |
||||||
|
|
||||||
|
for relation in relations: |
||||||
|
desc = '' |
||||||
|
for r in relation['relations']: |
||||||
|
desc += r['description'] + '\n' |
||||||
|
desc = desc.strip() |
||||||
|
print_green(relation['to']) |
||||||
|
print(find_person(name=relation['to'])) |
||||||
|
print() |
||||||
@ -0,0 +1,423 @@ |
|||||||
|
from _chroma import chroma |
||||||
|
from _arango import arango, db |
||||||
|
from _llm import LLM |
||||||
|
from print_color import * |
||||||
|
import difflib |
||||||
|
import re |
||||||
|
from langchain_text_splitters import CharacterTextSplitter |
||||||
|
|
||||||
|
# text_splitter = CharacterTextSplitter( |
||||||
|
# separator="\n\n", |
||||||
|
# chunk_size=8000, |
||||||
|
# chunk_overlap=0, |
||||||
|
# length_function=len, |
||||||
|
# is_separator_regex=False, |
||||||
|
# ) |
||||||
|
|
||||||
|
|
||||||
|
class Person:
    """Base class holding free-text info and an LLM-generated summary.

    NOTE(review): make_summary() relies on `self.doc` and `self.name`,
    which only the subclasses (UnknownPerson/FoundPerson) define; calling
    it on a bare Person will fail.
    """

    def __init__(self):
        # Free-text information about the person (newline-joined), if any.
        self.info = None
        # LLM-generated summary of self.info, set by make_summary().
        self.summary = None

    def make_summary(self):
        """Build a search-oriented summary of the person and store it in
        self.summary.

        When stored info is missing or very short, the info is first derived
        from the person's source interrogation text via the LLM.
        """
        llm = LLM(chat=False, system_prompt="Du sammanfattar information om en person utifrån ett polisförhör. Sammanfattningen ska sedan användas för att göra en sökning i en vektordatabas.")
        info = self.info
        # NOTE(review): if self.info is falsy and the doc lacks
        # 'interrogation_key', the lookup below raises KeyError — confirm
        # callers always provide it in that case.
        if not self.info or all([len(self.info) < 200, 'interrogation_key' in self.doc, 'name' in self.doc]):
            interrogation_text = db.collection("interrogations").get(self.doc['interrogation_key'])['text']
            if len(interrogation_text) > 20000:
                # Trim very long interrogations to an 8000-char window around
                # the first mention of the name (or just the beginning).
                if self.doc['name'] in interrogation_text:
                    index = interrogation_text.find(self.doc['name'])
                    if index < 1000:
                        interrogation_text = interrogation_text[:8000]
                    else:
                        interrogation_text = interrogation_text[index-1000:][:8000]

            prompt = f"""Nedan är ett polisförhör:\n
{interrogation_text}\n
Jag är intresserad av en person som omnämns som "{self.doc['name']}". Gör en detaljerad sammanfattning av informationen om {self.name}. Var noga med relationer, namn och platser. Svara ENBART med informationen om personen, ingenting annat. Svara alltid på svenska!"""
            info = llm.generate(prompt)
            # Keep any pre-existing info in front of the derived info.
            if self.info:
                info = self.info + "\n" + info
        print_rainbow(f'Info about: {self.name}', info)
        summary_prompt = f""""Nedan är olika bitar med information om en person:\n
{info}\n
Sammanfatta dessa på ett detaljerat sätt. Var noga med namn, platser, händelser och relationer.
Använd bara sånt som finns i informationen. Svara ENBART med sammanfattningen, ingenting annat."""
        self.summary = llm.generate(summary_prompt)
||||||
|
|
||||||
|
|
||||||
|
class UnknownPerson(Person):
    """A not-yet-identified person, built from an ArangoDB document.

    Every document field is mirrored as an attribute; `info` is normalized
    to a single newline-joined string (or None) and `name` falls back to
    the empty string.
    """

    def __init__(self, doc: dict):
        super().__init__()
        self.doc: dict = doc
        # Mirror all document fields as attributes for convenient access.
        for field, value in doc.items():
            setattr(self, field, value)
        # Normalize the two fields the rest of the code relies on.
        self.info = "\n".join(doc["info"]) if "info" in doc else None
        self.name = doc.get("name", "")
||||||
|
|
||||||
|
|
||||||
|
class FoundPerson(Person):
    """
    Represents a person found in ArangoDB.

    Attributes:
        name (str): The name of the person.
        info (str): Additional information about the person.
        key (str): A unique identifier for the person.
        doc (dict): The person's document in ArangoDB.
        summary (str): A summary of the person's details.
    """

    def __init__(self, db, name, key):
        super().__init__()
        self.name = name
        self.key = key
        # Fetch the full person document; assumes *key* exists in "persons"
        # and the document has an "info" list — TODO confirm.
        self.doc = db.collection("persons").get(key)
        self.info = "\n".join(self.doc["info"])
||||||
|
|
||||||
|
|
||||||
|
class PersonIdentifier: |
||||||
|
def __init__( |
||||||
|
self, |
||||||
|
doc: dict = None, |
||||||
|
name: str = None, |
||||||
|
key: str = None, |
||||||
|
person: UnknownPerson = None, |
||||||
|
interrogation_key: str=None, |
||||||
|
text: str=None |
||||||
|
): |
||||||
|
self.doc: dict = doc |
||||||
|
self.name: str = name |
||||||
|
if 'name' in doc: |
||||||
|
self.name = doc['name'] |
||||||
|
self.key: str = key |
||||||
|
if '_key' in doc: |
||||||
|
self.key = doc['_key'] |
||||||
|
self.unknown_person: UnknownPerson = None |
||||||
|
self.found_person: FoundPerson = None |
||||||
|
self.suggestions = None |
||||||
|
self.interrogation_key = interrogation_key |
||||||
|
self.text = text |
||||||
|
|
||||||
|
self.get_unknown_person(doc, name, key, person) |
||||||
|
|
||||||
|
def get_unknown_person(self, doc, name, key, person): |
||||||
|
"""Get the unknown person.""" |
||||||
|
self.unknown_person = None |
||||||
|
self.found_person = None |
||||||
|
|
||||||
|
# Set the unknown person |
||||||
|
if person: |
||||||
|
self.unknown_person = person |
||||||
|
elif doc: |
||||||
|
self.unknown_person = UnknownPerson(doc) |
||||||
|
elif key and db.collection("persons").get(key): |
||||||
|
self.unknown_person = UnknownPerson(db.collection("persons").get(key)) |
||||||
|
else: |
||||||
|
assert key or name, "Both key and name are missing." |
||||||
|
self.unknown_person = UnknownPerson( |
||||||
|
{k: v for k, v in [("name", name), ("_key", key)] if v} |
||||||
|
) |
||||||
|
|
||||||
|
def check_name(self, text): |
||||||
|
"""Check if it's likely that person and answer_person are the same person.""" |
||||||
|
print_yellow(self.unknown_person.name, " - ", self.found_person.name) |
||||||
|
same = False |
||||||
|
|
||||||
|
# If person only has one name, first or last, compare that to first and last name of answer_person |
||||||
|
if len(self.unknown_person.name.strip().split()) == 1: |
||||||
|
llm = LLM() |
||||||
|
answer_first_name = self.found_person.name.split()[0].strip() |
||||||
|
answer_last_name = self.found_person.name.split()[-1].strip() |
||||||
|
|
||||||
|
if ( |
||||||
|
difflib.SequenceMatcher( |
||||||
|
None, self.unknown_person.name, answer_first_name |
||||||
|
).ratio() |
||||||
|
> 0.9 |
||||||
|
): |
||||||
|
if answer_last_name in text: |
||||||
|
same = True |
||||||
|
else: |
||||||
|
# Count how many time the first name appears in the first_names list |
||||||
|
first_names = [ |
||||||
|
i["name"].split()[0] for i in db.collection("persons").all() |
||||||
|
] |
||||||
|
first_name_count = first_names.count(answer_first_name) |
||||||
|
|
||||||
|
if first_name_count == 1: |
||||||
|
same = True |
||||||
|
else: |
||||||
|
llm = LLM(small=True) |
||||||
|
answer = llm.generate( |
||||||
|
f'Nämns någon med efternamnet "{answer_last_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"' |
||||||
|
) |
||||||
|
if "JA" in answer: |
||||||
|
same = True |
||||||
|
|
||||||
|
elif ( |
||||||
|
difflib.SequenceMatcher( |
||||||
|
None, self.unknown_person.name, answer_last_name |
||||||
|
).ratio() |
||||||
|
> 0.9 |
||||||
|
): |
||||||
|
if answer_first_name in text: |
||||||
|
same = True |
||||||
|
else: |
||||||
|
llm = LLM(small=True) |
||||||
|
answer = llm.generate( |
||||||
|
f'Nämns någon med förnamnet "{answer_first_name}" i texten nedan?\n\n"""{text[:5000]}"""\n\nNamnet behöver inte vara stavat på exakt samma sätt, men det ska vara samma namn. Svara "JA" eller "NEJ"' |
||||||
|
) |
||||||
|
if "JA" in answer: |
||||||
|
same = True |
||||||
|
|
||||||
|
else: |
||||||
|
name_similarity = difflib.SequenceMatcher( |
||||||
|
None, self.unknown_person.name, self.found_person.name |
||||||
|
).ratio() |
||||||
|
|
||||||
|
if name_similarity > 0.85: |
||||||
|
same = True |
||||||
|
|
||||||
|
return same |
||||||
|
|
||||||
|
def find_with_llm(self): |
||||||
|
if not self.unknown_person.summary: |
||||||
|
self.unknown_person.make_summary() |
||||||
|
llm = LLM(chat=True, system_prompt="Du hjälper till att ta reda på vad en person heter. Först skapar du meningar som ska användas för att söka i en vektordatabas, sedan använder du informationen du får där till att ta reda på vad personen heter. Svara alltid på svenska.") |
||||||
|
print_rainbow('Info bites:', self.unknown_person.summary) |
||||||
|
info_bites = llm.generate(f"Nedan är olika bitar med information om en person:\n\n {self.unknown_person.summary} \n\Dela upp den i 3-4 meningar där varje mening beskriver en specifik detalj om personen. Svara med en mening per rad. Svara ENBART med informationen om personen, ingenting annat.") |
||||||
|
querys = info_bites.split("\n") |
||||||
|
print_rainbow('Querys:', querys) |
||||||
|
chroma_docs = chroma.query( |
||||||
|
query_texts=querys, |
||||||
|
n_results=3, |
||||||
|
collection="mala_interrogations", |
||||||
|
) |
||||||
|
info = '' |
||||||
|
for answer in chroma_docs['documents']: |
||||||
|
for doc in answer: |
||||||
|
print_blue(doc) |
||||||
|
info += doc + "\n" |
||||||
|
|
||||||
|
prompt = f'''Nedan är en text där {self.name} nämns:\n\n{self.text}\n\nJag vill veta vem "{self.unknown_person.name}" är. Läs texten nedan för att se om du kan hitta personens fulla namn:\n |
||||||
|
{info}\n |
||||||
|
Vad heter "{self.unknown_person.name}"? Svara med förnamn och efternamn på formen "Förnamn Efternamn". Svara "None" om det inte går att säga utifrån informationen.''' |
||||||
|
print_yellow('Längd på info:', len(info)) |
||||||
|
print_rainbow('Prompt', prompt) |
||||||
|
answer = llm.generate(prompt) |
||||||
|
print_green(answer) |
||||||
|
|
||||||
|
|
||||||
|
def find_person(self): |
||||||
|
"""Finds a person in the Chroma db.""" |
||||||
|
|
||||||
|
if "is_not" in self.unknown_person.doc: |
||||||
|
list_filter_isnot = [self.unknown_person.name].append( |
||||||
|
self.unknown_person.doc["is_not"] |
||||||
|
) |
||||||
|
else: |
||||||
|
list_filter_isnot = [self.unknown_person.name] |
||||||
|
|
||||||
|
filter_isnot = {"name": {"$nin": list_filter_isnot}} |
||||||
|
|
||||||
|
query_results = chroma.query( |
||||||
|
query_texts=[self.unknown_person.name], |
||||||
|
n_results=1, |
||||||
|
where=filter_isnot, |
||||||
|
collection="mala_persons", |
||||||
|
) |
||||||
|
|
||||||
|
distance = query_results["distances"][0][0] |
||||||
|
print_purple(query_results["metadatas"][0][0]["name"], distance) |
||||||
|
|
||||||
|
if distance > 1: #! This is not really working... |
||||||
|
self.unknown_person.make_summary() |
||||||
|
query_results = chroma.query( |
||||||
|
query_texts=[self.unknown_person.summary], |
||||||
|
n_results=1, |
||||||
|
where=filter_isnot, |
||||||
|
collection="mala_persons_info", |
||||||
|
) |
||||||
|
distance = query_results["distances"][0][0] |
||||||
|
print_yellow(query_results["metadatas"][0][0]["name"], distance) |
||||||
|
if distance > 1: |
||||||
|
return None |
||||||
|
|
||||||
|
# return unknown_person, found_person, False |
||||||
|
|
||||||
|
print_blue("Name found peson:", query_results["documents"][0][0]) |
||||||
|
found_person = FoundPerson( |
||||||
|
db, |
||||||
|
name=query_results["metadatas"][0][0]["name"], |
||||||
|
key=query_results["metadatas"][0][0]["_key"], |
||||||
|
) |
||||||
|
|
||||||
|
return found_person |
||||||
|
|
||||||
|
def identify(self):
    """Try to match the unknown person against known persons.

    Runs find_person() for a vector lookup.  When a candidate is found,
    asks an LLM — for every interrogation that mentions the unknown name —
    whether the two are likely the same person.  Stores the results in
    self.suggestions as (answer, interrogation) tuples; when no candidate
    exists the answer part is None.
    """
    llm = LLM(small=True)

    # BUG FIX: find_person() takes no argument besides self; the original
    # passed self.unknown_person, which raised TypeError.
    self.found_person = self.find_person()

    if not self.found_person:
        # No candidate found: surface the raw interrogation keys and stop.
        # BUG FIX: the original fell through and crashed on
        # self.found_person.make_summary() (None has no make_summary).
        self.suggestions = [
            (None, i) for i in self.unknown_person.doc["mentioned_in_interrogation"]
        ]
        return

    # Summarize the found person's info once; reused in every prompt below.
    self.found_person.make_summary()

    suggestions = []
    for interrogation_id in self.unknown_person.doc["mentioned_in_interrogation"]:
        interrogation_data = db.collection("interrogations").get(interrogation_id)
        text = interrogation_data["text"]

        answer_prompt = f'''I texten nedan omnämns en "{self.unknown_person.name}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n
TEXT:
"""{text}"""\n

På andra ställen i polisens förundersökning finns en person som heter "{self.found_person.name}", och som beskrivs så här:
"""{self.found_person.summary}"""\n
Verkar det troligt att personen som kallas {self.unknown_person.name} är samma person som {self.found_person.name}? Svara bara JA eller NEJ, samt en kort förklaring till varför.
'''
        answer = llm.generate(answer_prompt)
        suggestions.append((answer, interrogation_data))

    self.suggestions = suggestions
|
def verify(
    db,
    answer=None,
    unknown_person=None,
    found_person=None,
    interrogation_key=None,
):
    """
    Verifies the answer for a person's identification in an interrogation.

    Args:
        db: The database object (ArangoDB).
        answer (str): The answer for the person's identification.
            Can be "Yes", "No", or "Unknown".
        unknown_person: Wrapper around the unidentified person's doc.
        found_person: Wrapper around the candidate person's doc.
        interrogation_key (str): The key identifying the interrogation.

    Returns:
        None
    """
    print_blue("Answer:", answer)

    # If the answer is Yes: merge the unknown person into the found person.
    if answer == "Yes":
        unknown_person.doc["mentioned_in_interrogation"].remove(interrogation_key)
        db.collection("persons").update(unknown_person.doc)

        found_person.doc["confirmed"] = True
        # BUG FIX: the original did `info += info`, duplicating the found
        # person's own info — merge the unknown person's info instead.
        found_person.doc["info"] += unknown_person.doc["info"]
        # BUG FIX: the original appended the literal string
        # "mentioned_in_interrogation" instead of the interrogation key.
        found_person.doc["mentioned_in_interrogation"] += [interrogation_key]

        print("Updated person in arango:")
        print_green(
            db.collection("persons").insert(
                found_person.doc, overwrite_mode="update"
            )
        )
        # When no interrogations remain for the duplicate entry, archive it
        # in "other_persons" and delete it from "persons".
        if (
            unknown_person.doc["mentioned_in_interrogation"] == []
            and unknown_person.doc["_key"] != found_person.doc["_key"]
        ):
            db.collection("other_persons").insert(
                unknown_person.doc, overwrite_mode="update"
            )
            db.collection("persons").delete(unknown_person.doc, check_rev=False)
            print_red(f"Removed {unknown_person.doc}")

    # If the answer is No: remember the rejected candidate name.
    if answer == "No":
        if "is_not" not in unknown_person.doc:
            unknown_person.doc["is_not"] = []

        # BUG FIX: the original appended a one-element LIST, producing a
        # nested list that breaks the flat `$nin` name filter used by
        # find_person(); append the plain name string instead.
        unknown_person.doc["is_not"].append(found_person.doc["name"])
        db.collection("persons").update(
            unknown_person.doc, merge=True, check_rev=False
        )

    # If the answer is Unknown: park the name for later manual review.
    if answer == "Unknown":
        db.collection("unknown").insert(
            {"name": unknown_person.name, "interrogation": interrogation_key},
            overwrite=True,
        )
||||||
|
class PersonFinder:
    """Extracts person names from interrogation text via an LLM."""

    def __init__(
        self,
        names=None,
        chunk_size=5000,
        chunk_overlap=0,
        separator="\n\n",
    ):
        """
        Args:
            names (dict | None): Already-known names mapped to their
                canonical names; defaults to an empty dict.
            chunk_size (int): Max characters per text chunk.
            chunk_overlap (int): Overlap between consecutive chunks.
            separator (str): Separator used to split the text.
        """
        # BUG FIX: the original used a mutable default argument
        # (`names={}`), which is shared across all instances.
        self.names = {} if names is None else names
        self.llm = LLM(
            chat=False,
            small=True,
            system_prompt="Du är en assistent som hjälper till att hitta personer i ett polisförhör. Svara bara när personen finns i den del du får, hitta inte på personer.",
        )
        # NOTE(review): CharacterTextSplitter is not imported in the
        # visible part of this file (presumably langchain) — confirm the
        # import exists at runtime.
        self.text_splitter = CharacterTextSplitter(
            # BUG FIX: the `separator` parameter was silently ignored
            # (hard-coded "\n\n"); pass it through.
            separator=separator,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )

    def extract_names(self, chunk, extra_prompt=""):
        """Ask the LLM which persons are mentioned in `chunk`.

        Returns the list of new names found in the chunk.  A name that is
        a substring of an already-known name is treated as an alias of it
        (recorded in self.names) rather than returned as new.
        """
        chunk_names = []
        # Find persons in the text
        prompt = f'''Jag vill hitta alla personer som nämns i texten nedan:\n
"""{chunk}"""\n
Vilka personer nämns i texten? Svara ENBART med en pythonformaterad lista av namn.
Exempel på svar för att du ska förstå formen:
<exempel>
[namn1, namn2, namn3].
</exempel
Var noga med att svara
{extra_prompt}'''
        response = self.llm.generate(prompt)
        # Keep only letters, hyphens, spaces and commas; drop list noise
        # and the placeholder word "namn" from the example.
        response = re.sub(r"[^a-zA-ZåäöÅÄÖ\- ,]", "", response).replace(" namn ", "")

        for name in [i.strip() for i in response.split(",") if len(i) > 2]:
            same_name = False
            if name not in chunk_names and name not in self.names:
                # BUG FIX: the original tested `self.names != []`, which
                # compares a dict to a list and is therefore always True.
                if self.names:
                    for n in list(self.names):
                        if name in n:
                            # Partial match (e.g. first name only): alias
                            # the new spelling to the known canonical name.
                            same_name = True
                            self.names[name] = self.names[n]

                if not same_name:
                    chunk_names.append(name)

        return chunk_names
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Ad-hoc manual smoke test: fetch one rumor chunk and try to identify
    # a person mentioned in an interrogation.
    # NOTE(review): `text` is fetched but never used below — confirm intent.
    text = db.collection('rumors').get('Mikael_Sjostrom_2023-02-13_p.98')
    # NOTE(review): `db` and `PersonIdentifier` are not defined in the
    # visible portion of this file — confirm they exist at module level.
    person = PersonIdentifier(
        doc={'name': 'Douglas', 'interrogation_key': "_'Larsson',_'_Neo'__2023-02-15_p.208"})
    person.find_with_llm()
||||||
@ -1,11 +0,0 @@ |
|||||||
# Script: print the recorded `reason` for every interrogation that has one.
from _arango import arango
from _llm import LLM

# Keep the model loaded between calls; plain completion mode (no chat).
# NOTE(review): `llm` is created but never used in this script — confirm
# whether it can be removed.
llm = LLM(keep_alive=6000, chat=False)

# All interrogation docs that have a non-null `reason` field.
q = 'for doc in interrogations filter doc.reason != null return doc'
docs = [i for i in arango.db.aql.execute(q)]

for doc in docs:
    # Person name in green (ANSI escape), then the reason in plain text.
    print("\033[92m", doc['person'], "\033[0m", doc['reason'])
@ -1,23 +1,58 @@ |
|||||||
|
from random import choice |
||||||
|
|
||||||
|
|
||||||
def print_green(*args): |
def print_green(*args): |
||||||
text = '' |
text = "" |
||||||
for arg in args: |
for arg in args: |
||||||
text += str(arg) + ' ' |
text += str(arg) + " " |
||||||
print(f"\033[92m{text}\033[0m") |
print(f"\033[92m{text}\033[0m") |
||||||
|
|
||||||
|
|
||||||
def print_red(*args): |
def print_red(*args): |
||||||
text = '' |
text = "" |
||||||
for arg in args: |
for arg in args: |
||||||
text += str(arg) + ' ' |
text += str(arg) + " " |
||||||
print(f"\033[91m{text}\033[0m") |
print(f"\033[91m{text}\033[0m") |
||||||
|
|
||||||
|
|
||||||
def print_yellow(*args): |
def print_yellow(*args): |
||||||
text = '' |
text = "" |
||||||
for arg in args: |
for arg in args: |
||||||
text += str(arg) + ' ' |
text += str(arg) + " " |
||||||
print(f"\033[93m{text}\033[0m") |
print(f"\033[93m{text}\033[0m") |
||||||
|
|
||||||
|
|
||||||
def print_blue(*args): |
def print_blue(*args): |
||||||
text = '' |
text = "" |
||||||
|
for arg in args: |
||||||
|
text += str(arg) + " " |
||||||
|
print(f"\033[94m{text}\033[0m") |
||||||
|
|
||||||
|
|
||||||
|
def print_purple(*args): |
||||||
|
text = "" |
||||||
|
for arg in args: |
||||||
|
text += str(arg) + " " |
||||||
|
print(f"\033[95m{text}\033[0m") |
||||||
|
|
||||||
|
|
||||||
|
def choose_color(last_color_index):
    """Return the next color in the cycle blue→green→yellow→red→purple.

    Args:
        last_color_index: Index of the previously used color (-1 to start).

    Returns:
        (ansi_code, color_name, index) for the color after last_color_index.
    """
    palette = [
        ("blue", "\033[94m"),
        ("green", "\033[92m"),
        ("yellow", "\033[93m"),
        ("red", "\033[91m"),
        ("purple", "\033[95m"),
    ]
    next_index = (last_color_index + 1) % len(palette)
    name, code = palette[next_index]
    return code, name, next_index
||||||
|
|
||||||
|
def print_rainbow(*args):
    """Print each argument in a different color, cycling through the
    palette provided by choose_color(), separated by spaces."""
    idx = -1
    segments = []
    for arg in args:
        code, _name, idx = choose_color(idx)
        segments.append(f"{code}{arg}\033[0m ")
    print("".join(segments))
||||||
@ -0,0 +1,15 @@ |
|||||||
|
# Script: ask the LLM, for every interrogation, what it says about
# Saturday night, and print the answer per person.
from _llm import LLM
from _arango import arango
from print_color import *


llm = LLM(chat=False)
interrogations = list(arango.db.collection("interrogations").all())

for interrogation in interrogations:
    text = interrogation['text']
    # BUG FIX: "prsonre" -> "personer" (typo in the prompt text).
    prompt = f'Vad sägs om lördagskvällen i texten nedan? \n\n"""{text}""" Jag vill veta vad som sägs i texten om lördagskvällen. Var noga med personer, namn och platser.'

    answer = llm.generate(prompt)
    # Person name in blue, then the LLM's answer.
    print_blue(interrogation['person'])
    print(answer, '\n')
||||||
@ -0,0 +1,94 @@ |
|||||||
|
from _chroma import ChromaDB |
||||||
|
from _openai import LLM_OpenAI as LLM |
||||||
|
import streamlit as st |
||||||
|
from print_color import * |
||||||
|
|
||||||
|
def get_docs(user_input):
    """Fetch the five interrogation chunks most similar to the query."""
    return chroma.query('mala_interrogations', user_input, n_results=5)
||||||
|
|
||||||
|
def generate_prompt(user_input, docs):
    """Build the RAG prompt from a Chroma query result.

    Args:
        user_input (str): The user's question.
        docs (dict): Chroma result with 'documents' and 'metadatas' lists
            (first element of each holds the hits for the single query).

    Returns:
        str: The prompt to send to the LLM.
    """
    texts = [text for text in docs['documents'][0]]
    metas = [{'person': meta['person'], 'date': meta['date']} for meta in docs['metadatas'][0]]
    combined_data = list(zip(texts, metas))
    string = ''
    for text, meta in combined_data:
        # One uppercase header per excerpt, then the excerpt itself.
        string += f'\n\nFrån förhör med {meta["person"]} {meta["date"]}:'.upper()
        string += f'\n{text}\n\n'

    prompt = f'''Svara på frågan: {user_input}\n
Använd endast informationen nedan:\n
{string}\n
Skriv utförligt på svenska och var noga med detaljer som namn, plats och datum.
Får gärna med information från alla fem förhör om det är relevant.\n
{user_input}'''
    # BUG FIX: the original called st.session_state.llm.generate(prompt)
    # here and discarded the result — the caller generates from the
    # returned prompt anyway, so every question triggered TWO LLM calls
    # (and polluted the chat model's message history).
    return prompt
||||||
|
|
||||||
|
st.set_page_config(
    page_title="Malå",
)

# Main chat LLM lives in session state so it is NOT reset on each
# Streamlit rerun (keeps the conversation context).
if "llm" not in st.session_state:
    st.session_state.llm = LLM(chat=True, system_prompt='Du är assistent åt en journalist som går igenom förhör i en förundersökning. Svara bara utifrån den information du får. Svara alltid på svenska!')

# Helper LLM and vector db ARE recreated on every rerun (stateless use).
llm_checker = LLM(chat=True)
chroma = ChromaDB()


# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Accept user input
if user_input := st.chat_input("Fråga något om förhören."):
    print_blue(user_input)
    # With prior history, first classify the question as a follow-up or a
    # standalone question before hitting the vector db.
    if len(st.session_state.messages) > 1:
        history = ''
        for message in st.session_state.messages:
            # NOTE(review): trailing "'" in this f-string looks like a
            # stray apostrophe appended to every history line — confirm.
            history += f"{message['role']}: {message['content']}\n'"
        prompt = f'En användare har ställt frågan "{user_input}" och här är chatthistoriken mellan användaren och en assistent:\n{history}\n\nVerkar "{user_input}" vara en uppföljningfråga eller en fristående fråga? Svara ENDAST med "uppföljning" eller "fristående".'
        chat_completion = llm_checker.generate(prompt, stream=False)
        answer = chat_completion.choices[0].message.content

        print_red(answer)
        if 'uppföljning' in answer:
            # Rewrite the follow-up as a standalone question for retrieval.
            # NOTE(review): this prompt tells the model to use the history,
            # but `history` is not included in it — confirm intent.
            prompt=f'Använd historiken till att omformulera "{user_input}" till en helt fristående fråga. Frågan ska användas för att hitta information i förhören.'
            chat_completion = llm_checker.generate(prompt, stream=False)
            question2chroma = chat_completion.choices[0].message.content
        if 'fristående' in answer:
            question2chroma = user_input

        if 'None' in answer:
            question2chroma = user_input

        # NOTE(review): if the classifier answer contains none of the
        # keywords above, `question2chroma` is unbound here — confirm.
        print_yellow(question2chroma)

    else:
        question2chroma = user_input
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": user_input})
    # Display user message in chat message container
    with st.chat_message("user"):
        st.markdown(user_input)

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        # Retrieve with the (possibly rewritten) question, but prompt with
        # the user's original wording.
        docs = get_docs(question2chroma)
        prompt = generate_prompt(user_input, docs)
        stream = st.session_state.llm.generate(prompt)
        response = st.write_stream(stream)
        # Record the streamed answer in both the LLM's own history and the
        # UI chat history.
        st.session_state.llm.messages.append({'role': 'assistant', 'content': response})
        st.session_state.messages.append({"role": "assistant", "content": response})
    print()
||||||
|
|
||||||
|
|
||||||
Loading…
Reference in new issue