main
parent
f3ee5ba669
commit
9e7e23f0fa
12 changed files with 788 additions and 14 deletions
@ -0,0 +1,51 @@ |
||||
import chromadb as db |
||||
from chromadb.utils import embedding_functions |
||||
from chromadb.config import Settings |
||||
from chromadb.api.client import Client |
||||
from chromadb.api.models.Collection import Collection |
||||
|
||||
|
||||
|
||||
class ChromaDB:
    """
    A thin wrapper around a Chroma HTTP client.

    Attributes set by ``__init__``:
        client: The ``chromadb.HttpClient`` created for ``host``/``port``.
        embedding_function: A Hugging Face embedding function for the
            ``KBLab/sentence-bert-swedish-cased`` model.
    """

    # Environment variable consulted when no API key is passed explicitly.
    API_KEY_ENV_VAR = "HUGGINGFACE_API_KEY"

    def __init__(
        self,
        host: str = "192.168.1.10",
        port: int = 8001,
        api_key: "str | None" = None,
    ) -> None:
        """
        Initializes a ChromaDB object for the server at the given host and port.

        Args:
            host (str, optional): The host address of the Chroma database. Defaults to "192.168.1.10".
            port (int, optional): The port number of the Chroma database. Defaults to 8001.
            api_key (str, optional): Hugging Face API key. When omitted, the
                value of the ``HUGGINGFACE_API_KEY`` environment variable is used.

        Raises:
            ValueError: If no API key is passed and the environment variable is
                unset or empty.
        """
        # Imported locally so this class does not depend on the module's import block.
        import os

        # Credentials must never be committed to the repository; resolve the
        # key before anything else so a misconfiguration fails fast.
        if api_key is None:
            api_key = os.environ.get(self.API_KEY_ENV_VAR)
        if not api_key:
            raise ValueError(
                f"No Hugging Face API key: pass api_key or set {self.API_KEY_ENV_VAR}."
            )

        self.client: Client = db.HttpClient(
            settings=Settings(anonymized_telemetry=False),
            host=host,
            port=port,
        )
        # NOTE(review): this embedding function is stored but not passed to any
        # collection call visible here; confirm callers use it where intended.
        self.embedding_function = embedding_functions.HuggingFaceEmbeddingFunction(
            api_key=api_key,
            model_name="KBLab/sentence-bert-swedish-cased",
        )

    def print_collections(self):
        """
        Prints the name of every collection in the database, one per line.
        """
        for collection in self.client.list_collections():
            print(collection.name)
||||
|
||||
# Shared module-level instance; note that it is created whenever this module
# is imported, not only when the file is run as a script.
chroma = ChromaDB()


if __name__ == '__main__':
    # Drop the collection, create it again, and print how many items it holds.
    collection_name = 'mala_persons'
    chroma.client.delete_collection(collection_name)
    col = chroma.client.get_or_create_collection(collection_name)
    print(col.count())
||||
|
||||
|
||||
@ -0,0 +1,26 @@ |
||||
from _arango import arango |
||||
from _llm import LLM |
||||
db = arango.db

RELATIONS_COLLECTION = "rumors_relations"


def _match_person(response, persons_by_name):
    """Return the person named in an LLM answer of the form "name;how mentioned", or None."""
    for line in response.splitlines():
        name = line.split(";", 1)[0].strip()
        if not name:
            continue
        # Compare case-insensitively; the model may not reproduce the exact casing.
        return persons_by_name.get(name.casefold())
    return None


# Only create the edge collection on the first run so the script can be re-run.
if not db.has_collection(RELATIONS_COLLECTION):
    db.create_collection(RELATIONS_COLLECTION, edge=True)

q = 'for doc in rumors return doc'
rumors = list(db.aql.execute(q))
persons = list(db.collection("persons").all())
persons_string = "\n".join([i["name"] for i in persons])
persons_by_name = {i["name"].strip().casefold(): i for i in persons}
llm = LLM(chat=False)

for rumor in rumors:
    # Rumors may lack these fields entirely (they are optional in the stored documents).
    heard_from = rumor.get("heard_from")
    if not heard_from:
        continue
    summary = rumor.get("sexual_summary", "")

    # The summary is bound to a local first: nesting the same quote type inside
    # an f-string is a syntax error before Python 3.12.
    prompt = f'Nedan är en lista med personer:\n\n{persons_string}\n\nVem av dessa tror du att {heard_from} är?\n\nHär är mer kontext som det hjälper:\n\n{summary} \n\nSvara med namn och hur personen nämns i texten på formen "namn;hur personen nämns\n".'
    response = llm.generate(prompt)

    # Link the rumor only when the answer names a known person.
    person = _match_person(response, persons_by_name)
    if person is None:
        continue

    db.collection(RELATIONS_COLLECTION).insert(
        {
            "_from": person["_id"],
            "_to": rumor["_id"],
            "context": "rumor",
        }
    )
||||
@ -0,0 +1,31 @@ |
||||
import streamlit as st |
||||
#from identify_person import find_person |
||||
# from _arango import arango |
||||
|
||||
|
||||
# db = arango.db |
||||
# persons = list(db.collection('persons').all()) |
||||
|
||||
# q = 'for doc in persons filter doc.other == true return doc' |
||||
# other_persons = [i for i in db.aql.execute(q)] |
||||
|
||||
# Emit the same start marker to the terminal and to the Streamlit page.
start_message = 'Start'
print(start_message)
st.write(start_message)
||||
# start_button = st.button('Start') |
||||
# if start_button: |
||||
# st.write('Started') |
||||
# for person in other_persons: |
||||
# answer = find_person(person) |
||||
# if not answer: |
||||
# continue |
||||
|
||||
# for i in answer: |
||||
# answer, person_in_arango, text = i |
||||
|
||||
# st.write(f"Answer: {answer}") |
||||
# st.write(f"Person in Arango: {person_in_arango}") |
||||
# st.write(f"Interrogation: {text}") |
||||
|
||||
# st.stop() |
||||
|
||||
|
||||
@ -0,0 +1,21 @@ |
||||
import re |
||||
from _arango import arango |
||||
from _llm import LLM |
||||
from pprint import pprint |
||||
from pprint import pprint |
||||
from langchain_text_splitters import CharacterTextSplitter |
||||
import multiprocessing |
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # NOTE(review): `process_interrogation` is neither defined nor imported in
    # this file as shown, so the pool call below raises NameError until it is
    # imported from the module that defines it.
    db = arango.db

    # Load every interrogation document up front.
    interrogations = list(db.aql.execute('for doc in interrogations return doc'))

    # Fan the documents out over a process pool.
    with multiprocessing.Pool() as pool:
        results = pool.map(process_interrogation, interrogations)
||||
|
||||
@ -0,0 +1,270 @@ |
||||
import re |
||||
from _arango import arango |
||||
from _llm import LLM |
||||
from pprint import pprint |
||||
from pprint import pprint |
||||
from langchain_text_splitters import CharacterTextSplitter |
||||
import multiprocessing |
||||
|
||||
class Interrogation:
    """
    Analyses one interrogation document with an LLM and collects the findings.

    The text is split into chunks; each chunk can be screened for sexual
    content, and follow-up questions fill in the attributes that
    ``create_arango_doc`` later turns into a document for the ``rumors``
    collection.

    NOTE(review): the follow-up prompts contain no text of their own, so they
    presumably rely on ``LLM(chat=True)`` keeping the conversation history --
    confirm against ``_llm.LLM``. The order of the LLM calls is preserved for
    that reason.
    """

    def __init__(self, interrogation):
        """
        Args:
            interrogation (dict): Interrogation document. ``'text'`` is required
                here; ``'_key'`` and ``'date'`` are read by ``create_arango_doc``;
                ``'mentioned_persons'`` (a list of person ``_id`` values) is optional.
        """
        self.interrogation = interrogation
        self.llm = LLM(chat=True)
        self.llm_checker = LLM(chat=False)
        self.text = interrogation['text']

        # Info to collect
        self.sexual_content = None
        self.sexual_info = []
        self.sexual_content_description = None
        self.sexual_content_date = None
        self.self_experience = None
        self.self_involvement = None
        self.self_involvement_type = None
        self.heard_about = None
        self.heard_from = None
        self.sexual_chunk = None
        self.sexual_summary = None
        # NOTE(review): set by find_heard_from but not written by create_arango_doc.
        self.self_heard_from_id = None

        self.text_splitter = CharacterTextSplitter(
            separator="\n\n",
            chunk_size=3000,
            chunk_overlap=0,
            length_function=len,
            is_separator_regex=False,
        )
        self.chunks = self.text_splitter.split_text(self.text)

        # Always define these so later methods never hit an AttributeError.
        self.mentioned_persons = []
        self.mentioned_in_interrogations_dict = {}
        self.mentioned_in_interrogation = None

        if 'mentioned_persons' in interrogation:
            # Bind variables instead of string-building the query: ids containing
            # quotes can neither break nor alter the AQL.
            self.mentioned_persons = list(
                arango.db.aql.execute(
                    'for doc in persons filter doc._id in @ids return doc',
                    bind_vars={'ids': list(interrogation['mentioned_persons'])},
                )
            )
            self.mentioned_in_interrogation = [i['name'] for i in self.mentioned_persons]
            self.mentioned_in_interrogations_dict = {i['name']: i['_id'] for i in self.mentioned_persons}
            print(self.mentioned_in_interrogation)

    @staticmethod
    def _parse_yes_no(response):
        """Map an LLM answer to True ("JA"), False ("NEJ") or None (neither found)."""
        if 'JA' in response:
            return True
        if 'NEJ' in response:
            return False
        return None

    @staticmethod
    def _describe_person(person):
        """Return the upper-cased name followed by the person's info text."""
        info = person.get('info', '')
        # 'info' may be stored as a list of strings; join it rather than
        # rendering the Python list repr into the prompt.
        if isinstance(info, (list, tuple)):
            info = '\n'.join(str(part) for part in info)
        return f'{person["name"].upper()}\n{info}'

    def find_sexual_content(self, chunk, check_text=False):
        """
        Ask the LLM whether ``chunk`` touches on sexual misconduct.

        Args:
            chunk (str): Part of the interrogation text.
            check_text (bool): When True, use the checker LLM and only return
                the verdict without touching instance state.

        Returns:
            bool | None: The verdict when ``check_text`` is True; otherwise None
            (the verdict is stored in ``self.sexual_content``).
        """
        prompt = f'''
        Texten nedan är en del av ett polisförhör.

        TEXT:
        """{chunk}"""

        Jag vill veta om någonting i förhöret handlar om eller anspelar på något av:
        - Sexuella olämpligheter
        - Sexuella inviter
        - Övergrepp
        - Pedofili
        - Grooming

        Svara med "JA" eller "NEJ".
        Svar:
        '''

        if check_text:
            return self._parse_yes_no(self.llm_checker.generate(prompt))

        sexual_content = self._parse_yes_no(self.llm.generate(prompt))
        self.sexual_content = sexual_content

        if sexual_content:
            self.sexual_chunk = chunk
            self.sexual_content_description = self.llm.generate(
                'Beskriv det sexuella innehållet i förhöret.'
            )
            self.extract_sexual_info(chunk)
            self.sexual_content_date = self.llm.generate(
                'Ungefär när i tiden hände det som personen berättar om?'
            )

    def find_self_experience(self):
        """Ask whether the interrogated person experienced the events; sets ``self_experience``."""
        prompt = 'Har personen som förhörs själv varit med om något av det som beskrivs? Svara ENBART med "JA" eller "NEJ".'
        self.self_experience = self._parse_yes_no(self.llm.generate(prompt))

    def find_self_involvement(self):
        """Ask whether the interrogated person was involved and, if so, how."""
        prompt = 'Har personen som förhörs själv varit inblandad på något sätt? Svara ENBART med "JA" eller "NEJ".'
        self.self_involvement = self._parse_yes_no(self.llm.generate(prompt))
        if self.self_involvement:
            self.self_involvement_type = self.llm.generate(
                'På vilket sätt har personen som förhörs varit inblandad?'
            )

    def find_heard_about(self):
        """Ask whether the person heard about the events; on "JA" also look up the source."""
        prompt = 'Har personen hört talas om något av det som beskrivs? Svara ENBART med "JA" eller "NEJ".'
        self.heard_about = self._parse_yes_no(self.llm.generate(prompt))
        if self.heard_about:
            self.find_heard_from()

    def find_heard_from(self):
        """
        Ask who the person heard it from and try to match the answer to a
        person mentioned in the interrogation.

        Sets ``heard_from`` to the matched name (and ``self_heard_from_id`` to
        that person's ``_id``), or to the LLM's free-text answer when no match
        is found.
        """
        prompt = 'Av vem har personen hört det som beskrivs? Svara bara med namnet på personen, eller vad personen kallas.'
        # LLM answers often carry surrounding whitespace or newlines; strip them
        # so exact-name matching below can succeed.
        heard_from = self.llm.generate(prompt).strip()

        if self.mentioned_in_interrogation:
            mentioned_in_interrogation = '\n'.join(self.mentioned_in_interrogation)
            prompt = f'''Jag behöver identifiera vem {heard_from} är, verkar {heard_from} vara någon av följande personer:\n

            {mentioned_in_interrogation}

            Svara ENBART med med namnet på personen det skulle kunna vara. Om du inte vet svara "Jag vet inte".
            '''
            heard_from_answer = self.llm.generate(prompt).strip()
            if heard_from_answer in self.mentioned_in_interrogation:
                self.heard_from = heard_from_answer
                self.self_heard_from_id = self.mentioned_in_interrogations_dict[heard_from_answer]
            else:
                # Second attempt: give the model the stored info about each candidate.
                mentioned_info = '\n\n'.join(
                    [self._describe_person(i) for i in self.mentioned_persons]
                )
                prompt = f'''Här är mer information om möjliga personer:\n
                {mentioned_info}\n
                Kan du utifrån den säga vem {heard_from} är? Svara BARA med namnet på personen ur listanÄr du inte säker så svara "Jag vet inte".
                '''
                heard_from_answer_info = self.llm.generate(prompt).strip()
                if heard_from_answer_info in self.mentioned_in_interrogation:
                    self.heard_from = heard_from_answer_info
                    self.self_heard_from_id = self.mentioned_in_interrogations_dict[heard_from_answer_info]

        # Fall back to whatever the model called the source.
        if not self.heard_from:
            self.heard_from = heard_from

    def create_arango_doc(self):
        """Return the collected findings as a dict keyed by the interrogation's ``_key``."""
        return {
            '_key': self.interrogation['_key'],
            'sexual_content': self.sexual_content,
            'sexual_content_description': self.sexual_content_description,
            'self_experience': self.self_experience,
            'self_involvement': self.self_involvement,
            'self_involvement_type': self.self_involvement_type,
            'heard_about': self.heard_about,
            'heard_from': self.heard_from,
            'interrogation_date': self.interrogation['date'],
            'sexual_content_date': self.sexual_content_date,
            'sexual_info': '\n'.join(self.sexual_info),
            'sexual_summary': self.sexual_summary,
        }

    def extract_sexual_info(self, chunk):
        """
        Append a detailed LLM summary of ``chunk`` to ``sexual_info`` when the
        checker LLM confirms that the chunk contains relevant content.
        """
        sexual_content = self.find_sexual_content(chunk, check_text=True)

        if sexual_content:
            prompt = f'''
            Jag samlar uppgifter ur ett polisförhör och är intresserad av uppgifter som har att göra med eller anspelar på något av följande:

            - Sexuella olämpligheter
            - Sexuella inviter
            - Övergrepp
            - Pedofili
            - Grooming

            Texten nedan är nästa del i förhöret.\n
            TEXT:
            """{chunk}"""\n
            Om det finns något i texten som har att göra med listan ovan så sammanfatta dessa uppgifter så detaljerat som möjligt. Det behöver inte vara konkret fakta, jag är även intressserad av anspelningar och rykten.
            Var noga med detaljer som namn, platser, tider och händelser. Se också till att få med vem som är inblandad och hur och vem som sagt vad, samt hur personer är relaterade till varandra.
            Lägg inte till någon egen information, fokusera bara på texten.
            '''
            self.sexual_info.append(self.llm.generate(prompt))

    def collect_sexual_info(self):
        """
        Extract info from every chunk after ``sexual_chunk`` and store an LLM
        summary of everything collected in ``sexual_summary``.
        """
        index_of_chunk = self.chunks.index(self.sexual_chunk)
        # Slicing past the end yields an empty list, so no bounds check is needed.
        for chunk in self.chunks[index_of_chunk + 1:]:
            self.extract_sexual_info(chunk)

        sexual_info_string = '\n'.join(self.sexual_info)
        prompt = f'Nedan är innehåll som samlats in ur förhöret:\n\n"""{sexual_info_string}"""\n\nSammanfatta innehållet på ett detaljerat vis.'
        self.sexual_summary = self.llm.generate(prompt)

    def add_to_arango(self):
        """Insert (or overwrite) the findings in the ``rumors`` collection."""
        arango_doc = self.create_arango_doc()

        # Use arango.db directly: a module-level `db` only exists when the file
        # is executed as a script, not when this class is imported.
        arango.db.collection('rumors').insert(arango_doc, overwrite=True, keep_none=False)

        if arango_doc['sexual_content']:
            pprint(arango_doc)
||||
|
||||
|
||||
def process_interrogation(interrogation_data):
    """
    Analyse one interrogation document and store the outcome.

    Chunks are screened in order. At the first chunk flagged as sexual content
    the follow-up questions are asked, the remaining chunks are collected, the
    result is stored and the scan stops. If no chunk is flagged, the result is
    stored once after the scan.
    """
    subject = Interrogation(interrogation_data)

    for piece in subject.chunks:
        subject.find_sexual_content(piece)
        if not subject.sexual_content:
            continue

        subject.find_self_experience()

        # "Heard about" is only asked when the person was not personally involved.
        personally_involved = False
        if subject.self_experience:
            subject.find_self_involvement()
            personally_involved = bool(subject.self_involvement)
        if not personally_involved:
            subject.find_heard_about()

        subject.collect_sexual_info()
        subject.add_to_arango()
        break

    if not subject.sexual_content:
        subject.add_to_arango()
||||
|
||||
|
||||
if __name__ == "__main__":
    db = arango.db

    q = 'for doc in interrogations return doc'
    interrogations = list(db.aql.execute(q))

    # Skip interrogations whose _key already exists in the rumors collection.
    # A set keeps the membership test O(1) per interrogation.
    q = 'for rumor in rumors return rumor._key'
    processed_keys = set(db.aql.execute(q))
    interrogations = [
        interrogation
        for interrogation in interrogations
        if interrogation['_key'] not in processed_keys
    ]
    print('Number of interrogations to process:', len(interrogations))

    # Processed serially. The original followed this loop with exit() and an
    # unreachable multiprocessing.Pool().map(...) call; that dead code is removed.
    for interrogation in interrogations:
        process_interrogation(interrogation)
||||
|
||||
@ -0,0 +1,108 @@ |
||||
from _chroma import ChromaDB |
||||
from _arango import arango |
||||
from _llm import LLM |
||||
from pprint import pprint |
||||
from print_color import * |
||||
import multiprocessing |
||||
|
||||
def add_persons_to_chroma():
    """
    Add every non-"other" person that has an ``info`` field to the
    ``mala_persons`` Chroma collection.

    Only the person's name is stored as the document text; the name and
    ``_key`` are stored as metadata and the ``_key`` is used as the Chroma id.
    """
    db = arango.db
    q = "for doc in persons filter doc.other != true return doc"
    persons = [person for person in db.aql.execute(q) if 'info' in person]

    # Nothing to add; avoid sending an empty batch.
    if not persons:
        return

    documents = [person['name'] for person in persons]
    metadatas = [{'name': person['name'], '_key': person['_key']} for person in persons]
    ids = [person['_key'] for person in persons]

    # Only the ChromaDB class is imported into this module, so build the client
    # here, and get-or-create the collection the same way find_person does.
    chroma = ChromaDB()
    collection = chroma.client.get_or_create_collection('mala_persons')
    collection.add(documents=documents, metadatas=metadatas, ids=ids)
||||
|
||||
|
||||
def find_person(person):
    """
    Suggests which known person an "other" person might be and asks the LLM to judge.

    The closest name in the ``mala_persons`` Chroma collection is looked up;
    for every interrogation the person is mentioned in, the LLM is asked
    whether the two names refer to the same individual.

    Args:
        person (dict): The person to identify. Reads ``'name'`` and
            ``'mentioned_in_interrogation'`` (identifiers passed to
            ``db.collection('interrogations').get``).

    Returns:
        list[tuple] | None: One ``(answer, person_in_arango, text)`` tuple per
        interrogation, where ``answer`` is the LLM's reply, ``person_in_arango``
        the suggested person document and ``text`` the interrogation text.
        ``None`` when there is no usable suggestion (no hit, distance > 1, or
        the suggested person no longer exists).
    """
    db = arango.db
    llm = LLM()

    other_person = person['name']

    chroma = ChromaDB()
    col = chroma.client.get_or_create_collection('mala_persons')
    hits = col.query(query_texts=[other_person], n_results=1)

    # An empty collection yields no results for the query; nothing to suggest.
    if not hits['documents'] or not hits['documents'][0]:
        return None

    hit = hits['documents'][0][0]
    _key = hits['metadatas'][0][0]['_key']
    distance = hits['distances'][0][0]

    # Filter out hits with distance > 1
    if distance > 1:
        return None

    person_in_arango = db.collection('persons').get(_key)
    if person_in_arango is None:
        # The Chroma entry points at a person that is not in ArangoDB.
        return None
    info = '\n'.join(person_in_arango.get('info', []))

    print('Other:', other_person, 'Förslag:', hit, 'Distance:', distance)

    output = []
    for interrogation in person.get('mentioned_in_interrogation', []):
        interrogation_doc = db.collection('interrogations').get(interrogation)
        if interrogation_doc is None:
            continue
        text = interrogation_doc['text']
        prompt = f'''I texten nedan omnämns en "{other_person}" och jag försöker förstå om det kan vara exempelvis ett felstavat namn eller smeknamn för en annan person.\n
        TEXT:
        """{text}"""\n

        På andra ställen i polisens förundersökning finns en person som heter "{hit}", och som beskrivs så här:
        """{info}"""\n
        Verkar det troligt att personen som kallas {other_person} är samma person som {hit}? Svara bara JA eller NEJ, samt en kort förklaring till varför.
        '''
        # Possible extra guidance for the prompt (currently unused): if the first
        # or last name is completely different it is probably not the same person;
        # a single name without a surname may be a nickname or a description.
        answer = llm.generate(prompt)

        output.append((answer, person_in_arango, text))
    return output
||||
|
||||
def verify(answer, person, person_in_arango, text):
    """
    Ask the operator to confirm a match and, if confirmed, merge the records.

    Pressing Enter without typing anything confirms. On confirmation ``text``
    is moved from ``person``'s mentions to ``person_in_arango``'s, ``answer``
    is appended to ``person_in_arango['info']``, and ``person`` is moved from
    the ``persons`` collection to ``other_persons``. Any other input leaves
    everything untouched.

    Args:
        answer (str): Text appended to ``person_in_arango['info']``.
        person (dict): The "other" person document being resolved.
        person_in_arango (dict): The known person document it was matched to.
        text: Entry to move between the two ``mentioned_in_interrogation`` lists.
            NOTE(review): find_person yields the interrogation *text* in this
            position while the lists appear to hold interrogation identifiers --
            confirm which one callers should pass.
    """
    if input('Enter om det stämmer? >> ') != '':
        # Not confirmed. (Previously this path crashed on an unbound `found`.)
        return

    # Use arango.db directly: a module-level `db` only exists when the file is
    # executed as a script.
    db = arango.db
    persons = db.collection('persons')

    mentions = person['mentioned_in_interrogation']
    if text in mentions:  # list.remove raises ValueError when the value is absent
        mentions.remove(text)
    persons.update(person)

    person_in_arango['info'].append(answer)
    person_in_arango['mentioned_in_interrogation'].append(text)
    persons.update(person_in_arango)

    db.collection('other_persons').insert(person, overwrite=True)
    persons.delete(person, check_rev=False)
||||
|
||||
if __name__ == '__main__':
    db = arango.db
    persons = list(db.collection('persons').all())

    # Persons flagged as "other" are the ones that still need identifying.
    other_query = 'for doc in persons filter doc.other == true return doc'
    other_persons = list(db.aql.execute(other_query))

    for person in other_persons:
        print(find_person(person))
    exit()
||||
@ -0,0 +1,25 @@ |
||||
from _arango import arango |
||||
|
||||
db = arango.db
interrogations_collection = db.collection('interrogations')

persons = list(db.aql.execute("for doc in persons filter doc.other != true return doc"))

# Pass 1: record each person's _id on every interrogation that mentions them.
for person in persons:
    person_id = person['_id']
    for interrogation_key in person.get('mentioned_in_interrogation') or []:
        interrogation = interrogations_collection.get(interrogation_key)
        if interrogation is None:
            # The person references an interrogation that does not exist; skip
            # it instead of crashing on `in None`.
            continue
        mentioned = interrogation.setdefault('mentioned_persons', [])
        if person_id not in mentioned:
            mentioned.append(person_id)
            print(mentioned, interrogation['_key'])
            interrogations_collection.update(interrogation, merge=False)

interrogations = list(interrogations_collection.all())

# Pass 2: remove duplicate ids from documents written earlier.
for interrogation in interrogations:
    if 'mentioned_persons' in interrogation:
        interrogation['mentioned_persons'] = list(set(interrogation['mentioned_persons']))
        interrogations_collection.update(interrogation, merge=False)
||||
@ -0,0 +1,11 @@ |
||||
def print_green(text):
    """Print ``text`` wrapped in the ANSI bright-green escape and a reset."""
    colored = "\033[92m{}\033[0m".format(text)
    print(colored)
||||
|
||||
def print_red(text):
    """Print ``text`` wrapped in the ANSI bright-red escape and a reset."""
    colored = "\033[91m{}\033[0m".format(text)
    print(colored)
||||
|
||||
def print_yellow(text):
    """Print ``text`` wrapped in the ANSI bright-yellow escape and a reset."""
    colored = "\033[93m{}\033[0m".format(text)
    print(colored)
||||
|
||||
def print_blue(text):
    """Print ``text`` wrapped in the ANSI bright-blue escape and a reset."""
    colored = "\033[94m{}\033[0m".format(text)
    print(colored)
||||
@ -1,8 +1,19 @@ |
||||
from _arango import arango |
||||
|
||||
# One-off maintenance script operating directly on the ArangoDB collections.
# NOTE(review): this section comes from a diff hunk (8 old lines -> 19 new
# lines) shown without +/- markers, so some statements below may be removed
# lines rather than current ones -- verify against the actual file before
# running. The delete and truncate calls are destructive.

# Selects persons whose `interrogated` field equals "Unknown".
q = 'for doc in persons filter doc.interrogated == "Unknown" return doc'
# arango.db.collection("other_persons").truncate()
# arango.db.collection("all_relations").truncate()

# Delete every person matched by the query above.
for person in arango.db.aql.execute(q):
    arango.db.collection("persons").delete(person)
# persons = list(arango.db.collection("persons").all())

# Remove all documents from the all_relations collection.
arango.db.collection("all_relations").truncate()
# for person in persons:
#     for interrogation in person['interrogations']:

#         arango.db.collection('interrogations').update({'_key': interrogation, 'person_id': person['_id']}, )
#         print(f"Updated {interrogation} with person_id {person['_id']}")


interrogations = list(arango.db.collection("interrogations").all())

# Derive `person_id` from the `person` field: drop the 'persons_' substring
# and prefix the remainder with 'persons/'.
for interrogation in interrogations:
    interrogation['person_id'] = 'persons/' + interrogation['person'].replace('persons_', '')
    arango.db.collection('interrogations').update(interrogation, merge=False)
||||
Loading…
Reference in new issue