commit
ef5a16870a
11 changed files with 999 additions and 0 deletions
@ -0,0 +1,49 @@ |
||||
import os |
||||
import re |
||||
#import pandas as pd |
||||
from arango import ArangoClient, exceptions |
||||
from arango.database import StandardDatabase |
||||
from dotenv import load_dotenv |
||||
|
||||
class ArangoDB:
    """Holds a python-arango client and a handle to one database."""

    # Swedish letters are transliterated (instead of being replaced by "_")
    # so keys built from names stay readable.
    _TRANSLITERATION = str.maketrans("åäöÅÄÖ", "aaoAAO")

    # Every character outside this set is replaced by "_" in a key.
    _INVALID_KEY_CHARS = re.compile(r"[^a-zA-Z0-9_\-.@()+,=;$!*'%]")

    # Keys are cut to at most this many bytes.
    _MAX_KEY_BYTES = 254

    def __init__(self, database=None):
        """
        Connect to ArangoDB using settings taken from the environment.

        ``load_dotenv(".env")`` is called first, then these variables are
        read from ``os.environ``: ARANGO_HOSTS, ARANGO_USERNAME,
        ARANGO_PWD_LASSE and, when *database* is not given, ARANGO_DB.

        Args:
            database (str, optional): Name of the database to open. When
                falsy, the name is read from ARANGO_DB instead.

        Raises:
            KeyError: If a required environment variable is missing.
        """
        load_dotenv(".env")

        host = os.environ["ARANGO_HOSTS"]
        db = database if database else os.environ["ARANGO_DB"]
        username = os.environ["ARANGO_USERNAME"]
        pwd = os.environ["ARANGO_PWD_LASSE"]

        # Initialize the client and the database handle.
        self.client: ArangoClient = ArangoClient(hosts=host)
        self.db: StandardDatabase = self.client.db(db, username=username, password=pwd)

    def fix_key_name(self, string):
        """
        Makes a string a valid ArangoDB key name.

        å/ä/ö (both cases) are transliterated to a/o, every other character
        outside the allowed set becomes "_", and the result is truncated to
        254 characters.

        Args:
            string (str): The string to fix.

        Returns:
            str: The fixed string. An empty input yields an empty string,
            which callers must not use as a key.
        """
        string = string.translate(self._TRANSLITERATION)
        string = self._INVALID_KEY_CHARS.sub("_", string)
        # After the substitution only ASCII remains, so one character is one
        # byte and a plain slice enforces the byte limit.
        return string[: self._MAX_KEY_BYTES]
||||
|
||||
# Shared connection created at import time; the other scripts use it via
# ``from _arango import arango``.
arango = ArangoDB()


if __name__ == "__main__":
    # Smoke test: print the database handle. The connection created above is
    # reused instead of opening a second one.
    print(arango.db)
||||
@ -0,0 +1,264 @@ |
||||
from time import sleep |
||||
import requests |
||||
import concurrent.futures |
||||
import queue |
||||
import threading |
||||
from _arango import arango |
||||
|
||||
|
||||
class LLM:
    """
    Client for a chat HTTP endpoint (``/api/chat``) with an optional
    background worker that processes queued prompts concurrently.

    NOTE(review): the request format (model tag, ``keep_alive``, ``options``)
    looks like the Ollama chat API -- confirm against the server in use.
    """

    def __init__(
        self,
        chat=False,
        model="llama3:8b-instruct-q5_K_M",
        keep_alive=3600 * 24,
        start=False,
    ):
        """
        Initializes an LLM client.

        Args:
            chat (bool, optional): If True, the conversation history is kept
                between calls to generate(). Defaults to False.
            model (str, optional): The model to be used. Defaults to
                "llama3:8b-instruct-q5_K_M".
            keep_alive (int, optional): Value sent as "keep_alive" with each
                request. Defaults to 3600*24.
            start (bool, optional): If True, a separate thread running
                generate_concurrent() is started immediately.
                Defaults to False.
        """
        self.server = "192.168.1.12"
        self.port = 3300  # 11440 All 4 GPU # 4500 "SW"
        self.model = model
        self.temperature = 0
        self.system_message = 'Svara alltid på svenska. Svara bara på det som efterfrågas. Om du inte kan svara, skriv "Jag vet inte".'
        self.messages = [{"role": "system", "content": self.system_message}]
        self.chat = chat
        # Upper bound used by build_message(); measured in characters.
        self.max_tokens = 24000
        self.keep_alive = keep_alive
        self.request_queue = queue.Queue()
        self.result_queue = queue.Queue()
        self.all_requests_added_event = threading.Event()
        self.all_results_processed_event = threading.Event()
        self.stop_event = threading.Event()

        if start:
            self.start()

    def generate(self, message):
        """
        Send *message* to the model and return the answer text.

        In chat mode the message and the answer are appended to
        ``self.messages``. Otherwise a fresh two-message list is sent, so
        earlier prompts never leak into later requests and concurrent calls
        do not share state.

        Args:
            message (str): The user prompt.

        Returns:
            str: The content of the "message" entry in the response.

        Raises:
            Exception: If the response contains no "message" entry.
        """
        if self.chat:
            self.build_message(message)
            messages = self.messages
        else:
            # One-shot request: the system message plus this prompt only.
            messages = self.messages[:1] + [{"role": "user", "content": message}]

        data = {
            "model": self.model,
            "messages": messages,
            "options": {"temperature": self.temperature},
            "keep_alive": self.keep_alive,
            "stream": False,
        }

        # Make a POST request to the API endpoint
        result = requests.post(
            f"http://{self.server}:{self.port}/api/chat", json=data
        ).json()

        if "message" not in result:
            from pprint import pprint

            pprint(result)
            raise Exception("Error occurred during API request")

        answer = result["message"]["content"]
        if self.chat:
            self.messages.append({"role": "assistant", "content": answer})
        return answer

    def generate_concurrent(
        self,
        request_queue,
        result_queue,
        all_requests_added_event,
        all_results_processed_event,
    ):
        """
        Worker loop: run generate() for queued requests on a thread pool.

        Reads ``(doc_id, prompt)`` tuples from *request_queue*, keeps at most
        six requests in flight, and puts ``(doc_id, answer)`` tuples on
        *result_queue*. The loop ends once *all_requests_added_event* is set,
        the request queue is empty and every submitted request has been
        collected, or as soon as ``self.stop_event`` is set (results not yet
        collected are then dropped). *all_results_processed_event* is always
        set on exit, including when the loop raises.

        Side effect: sets ``self.chat`` to False.
        """
        self.chat = False
        buffer_size = 6  # The number of tasks to keep in the executor
        pending = {}  # future -> doc_id

        try:
            with concurrent.futures.ThreadPoolExecutor() as executor:
                while not self.stop_event.is_set():
                    # Top up the executor without blocking.
                    while len(pending) < buffer_size:
                        try:
                            doc_id, message = request_queue.get_nowait()
                        except queue.Empty:
                            break
                        pending[executor.submit(self.generate, message)] = doc_id

                    if not pending:
                        # The event is checked before the queue: once it is
                        # set no more items arrive, so an empty queue means
                        # everything has been handled.
                        if all_requests_added_event.is_set() and request_queue.empty():
                            break
                        self.stop_event.wait(0.1)
                        continue

                    # Wake up when something finishes, or after a second so
                    # stop_event is noticed.
                    done, _ = concurrent.futures.wait(
                        list(pending),
                        timeout=1,
                        return_when=concurrent.futures.FIRST_COMPLETED,
                    )
                    for future in done:
                        doc_id = pending.pop(future)
                        try:
                            summary = future.result()
                        except Exception as exc:
                            print("Document %r generated an exception: %s" % (doc_id, exc))
                        else:
                            # Put the document ID and the summary into the result queue
                            result_queue.put((doc_id, summary))
        finally:
            all_results_processed_event.set()

    def start(self):
        """Start a separate thread that runs generate_concurrent()."""
        threading.Thread(
            target=self.generate_concurrent,
            args=(
                self.request_queue,
                self.result_queue,
                self.all_requests_added_event,
                self.all_results_processed_event,
            ),
        ).start()

    def stop(self):
        """
        Stops the instance from processing further requests.
        """
        self.stop_event.set()

    def add_request(self, id, prompt):
        """Queue ``(id, prompt)`` for the background worker."""
        self.request_queue.put((id, prompt))

    def finish_adding_requests(self):
        """Signal that all requests have been added."""
        print("\033[92mAll requests added\033[0m")
        self.all_requests_added_event.set()

    def get_results(self):
        """
        Return the next ``(doc_id, summary)`` tuple from the result queue.

        Blocks until a result is available. Returns None once the worker has
        signalled that all results were processed and the queue is empty.
        """
        while True:
            try:
                return self.result_queue.get(timeout=1)
            except queue.Empty:
                if self.all_results_processed_event.is_set():
                    # A result may have arrived between the timeout and the
                    # event check; look once more before giving up.
                    try:
                        return self.result_queue.get_nowait()
                    except queue.Empty:
                        return None
                sleep(0.2)

    def build_message(self, message):
        """
        Append *message* as a user message and trim old history.

        The oldest non-system messages are removed while the summed length
        of all message contents (in characters) exceeds ``self.max_tokens``.
        The system message and the message just added are never removed.
        """
        self.messages.append({"role": "user", "content": message})

        total_length = sum(len(msg["content"]) for msg in self.messages)

        while total_length > self.max_tokens and len(self.messages) > 2:
            removed_message = self.messages.pop(1)  # oldest after the system message
            total_length -= len(removed_message["content"])

    def unload_model(self):
        """Send a chat request with ``keep_alive`` set to 0.

        NOTE(review): presumably this makes the server unload the model --
        confirm against the server's API documentation.

        Raises:
            KeyError: If the response has no "message"/"content" entry.
        """
        data = {
            "model": self.model,
            "messages": self.messages,
            "keep_alive": 0,
            "stream": False,
        }

        # The answer itself is discarded; the lookup only checks its shape.
        requests.post(f"http://{self.server}:{self.port}/api/chat", json=data).json()[
            "message"
        ]["content"]
||||
|
||||
|
||||
if __name__ == "__main__":
    # Summarise every document in the "interrogations" collection and store
    # the answer in its "summary" field. The queue/thread plumbing is the one
    # the LLM class already provides.
    llm = LLM(chat=False, model="llama3:8b-instruct-q5_K_M")
    collection = arango.db.collection("interrogations")

    # Start a separate thread that runs the generate_concurrent method
    llm.start()
    try:
        # Add requests to the request queue
        for doc in collection.all():
            text = doc["text"]
            prompt = f'Kolla på texten nedan: \n\n """{text}""" \n\n Sammanfatta förhöret med fokus på vad som sades, inte var det hölls eller annat formalia. Svara så kort som möjligt men var noga med detaljer som händelser som beskrivs, namn, datum och platser.\nKort sammanfattning:'
            llm.add_request(doc["_key"], prompt)

        # Signal that all requests have been added
        llm.finish_adding_requests()

        # Process the results; get_results() returns None when nothing is left.
        while True:
            result = llm.get_results()
            if result is None:
                break
            doc_id, summary = result
            print("\033[92m" + doc_id + "\033[0m", summary)
            # Update the document with the summary
            collection.update_match({"_key": doc_id}, {"summary": summary})
    finally:
        # Without this the non-daemon worker thread would keep the process
        # alive if the main thread raised.
        llm.stop()
||||
|
||||
# import argparse |
||||
# parser = argparse.ArgumentParser() |
||||
# parser.add_argument("--unload", action="store_true", help="Unload the model") |
||||
# args = parser.parse_args() |
||||
|
||||
# #llm = LLM(model='llama3:70b-text-q4_K_M', keep_alive=6000, chat=True) |
||||
# llm = LLM(keep_alive=60, chat=True) |
||||
|
||||
# if args.unload: |
||||
# llm.unload_model() |
||||
# else: |
||||
# while True: |
||||
# message = input(">>> ") |
||||
# print(llm.generate(message)) |
||||
@ -0,0 +1,176 @@ |
||||
from _llm import LLM |
||||
import fitz |
||||
from _arango import arango |
||||
from openai import OpenAI |
||||
from pprint import pprint |
||||
|
||||
|
||||
def extract_interrogation(text):
    """
    Ask the LLM for the metadata of an interrogation and store it.

    Four prompts are sent for *text*: who was interrogated, that person's
    title, the topic, and the date. Unless a document with the resulting key
    already exists in the "interrogations" collection, one is inserted.

    Reads the module-level names ``llm``, ``arango``, ``page`` and
    ``filename``; ``page`` must be the page *text* came from.

    Args:
        text (str): Text of the page that starts the interrogation.

    Returns:
        tuple: (interrogation_key, interrogated, interrogated_role,
        interrogation_topic, interrogation_date).
    """
    interrogated = llm.generate(
        f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är förhörd? Svara på formen "Förnamn Efternamn" \n\nFörhörd person:'
    )
    interrogated_role = llm.generate(
        f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är {interrogated}? \n\nTitel på förhörd person:'
    )
    interrogation_topic = llm.generate(
        f'Kolla på texten nedan: \n\n """{text}""" \n\n Vad handlade förhöret om? Svara så kortfattat som möjligt. \n\nFörhörets syfte:'
    )
    # The date format is spelled out with a four-digit year (was "YYY").
    interrogation_date = llm.generate(
        f'Kolla på texten nedan: \n\n """{text}""" \n\n När ägde förhöret rum? Svara på formen YYYY-MM-DD \n\nFörhörsdatum:'
    )

    print(f"Förhörd: {interrogated}")
    print(f"Förhörd roll: {interrogated_role}")
    print(f"Förhörets syfte: {interrogation_topic}")
    print(f"Förhörsdatum: {interrogation_date}")

    # The key is the same whether or not the document already exists, so it
    # is computed once.
    interrogation_key = arango.fix_key_name(
        f"{interrogated}_{interrogation_date}_p.{page.number}"
    )

    if not arango.db.has_document("interrogations/" + interrogation_key):
        arango.db.collection("interrogations").insert(
            {
                "_key": interrogation_key,
                "interrogated": interrogated,
                "role": interrogated_role,
                "topic": interrogation_topic,
                "date": interrogation_date,
                "page": page.number,
                "text": text,
                "filename": filename,
            }
        )

    return (
        interrogation_key,
        interrogated,
        interrogated_role,
        interrogation_topic,
        interrogation_date,
    )
||||
|
||||
|
||||
def extract_relations(text, interrogated_person):
    """
    Ask the LLM which relations are mentioned in part of an interrogation.

    Uses the module-level ``llm``.

    Args:
        text (str): The part of the interrogation to analyse.
        interrogated_person (str): Name of the interrogated person, inserted
            into the prompt.

    Returns:
        str: The raw LLM answer. The prompt asks for one
        "person1;person2;relation" entry per line, or "None".
    """
    # This is a normal (non-raw) f-string, so every "\n" written below
    # becomes a real line break in the prompt sent to the model.
    # NOTE(review): the example answer names "Peter", who is not mentioned in
    # the example text; this may encourage invented names -- confirm intent.
    prompt = f'''Nedan är en del av ett förhör med {interrogated_person}. Jag vill veta vilka relationer som på något vis nämns i texten. Dessa kan vara mellan {interrogated_person} och någon annan, mellan två personer som {interrogated_person} berättar om eller mellan en person och en organisation/plats.
Svara på formen "person1;person2;relation\n". Om det inte finns någon relation, svara med None.

Nedan är ett exempel för att du ska förstå hur du ska svara:
<EXEMPEL>
Text: """En solig dag promenerade Anna längs med stadens livliga gator. Plötsligt stannade hon upp när hon såg en bekant gestalt längre fram. Med ett leende gick hon fram och hälsade på personen, och de inledde en trevlig konversation. Ju mer de pratade, desto fler minnen från barndomen väcktes till liv. Till slut insåg de att de faktiskt var gamla klasskompisar från högstadiet. Skratten ekade när de mindes tokiga stunder och gemensamma vänner. Det var en oväntad men glädjande återförening mitt i vardagens trummer."""
Relationer: Anna;Peter;klasskompisar från högstadiet\n
</EXEMPEL>

Text: """{text}"""\n

Svara ENBART med relationerna, inga förklaringar eller exempel eller något annat. Kom ihåg att svara på formen "person1;person2;relation\n".
Relationer:'''

    return llm.generate(prompt)
||||
|
||||
|
||||
# * OpenAI
# The API key is read from the environment instead of being committed to the
# repository. The key that used to be hard-coded here has been published in
# version control and must be revoked.
import os

OPENAI_KEY = os.environ.get("OPENAI_API_KEY")

# The client is only referenced by the commented-out code at the end of this
# file, so a missing key leaves it as None instead of aborting the script.
client = OpenAI(api_key=OPENAI_KEY) if OPENAI_KEY else None
||||
|
||||
# * Llama
llm = LLM(chat=False, model="llama3:8b-instruct-q5_K_M")

# Becomes True once the first interrogation header page has been found;
# pages before that are skipped.
interrogation = False

# Open the PDF file
filename = "Förhörsprotokoll.pdf"
doc = fitz.open(f"pdfs/{filename}")

# A page containing at least two of these phrases is treated as the first
# page of a new interrogation.
control_words = [
    "Förhörsdatum",
    "Förhör påbörjat",
    "Förhör avslutat",
    "Förhörssätt",
    "Typ av förhör",
    "Förhörsvittne",
]

for page in doc:
    text = page.get_text()

    n_control_words = sum(word in text for word in control_words)

    if n_control_words >= 2:
        print("\n\n")
        interrogation = True
        (
            interrogation_key,
            interrogated,
            interrogated_role,
            interrogation_topic,
            interrogation_date,
        ) = extract_interrogation(text)

    if not interrogation:
        continue

    # Extract relations from the page
    relations = extract_relations(text, interrogated_person=interrogated)

    for line in relations.split("\n"):
        line = line.strip()
        # Blank lines (e.g. after a trailing newline) and "None" carry no relation.
        if not line or line == "None":
            continue

        relation_parts = line.split(";")
        if len(relation_parts) < 3:
            # The model did not follow "person1;person2;relation"; skip the
            # line instead of failing on a missing field.
            print("\033[91m" + line + "\033[0m")
            continue

        person1 = relation_parts[0].strip()
        person2 = relation_parts[1].strip()
        # Any further ";" are treated as part of the relation description.
        relation = " - ".join(part.strip() for part in relation_parts[2:])

        prompt = f'Kolla på texten nedan: \n\n """{text}""" \n\n Vilken del av texten beskriver relationen "{relation}" mellan {person1} och {person2}? Svara med den ORDAGRANNA texten. \n\nDel av text:'
        relation_text = llm.generate(prompt)
        arango_document = {
            "_key": arango.fix_key_name(
                f"{person1}_{person2}_{relation}_p.{page.number}"
            ),
            "person1": person1,
            "person2": person2,
            "relation": relation,
            "page": page.number,
            "text": relation_text,
            "filename": filename,
            # NOTE(review): the field name is misspelled ("iterrogation"); it
            # is kept because stored documents may already use it.
            "iterrogation": interrogation_key,
            "interrogated_person": interrogated,
            "interrogation_date": interrogation_date,
        }

        pprint(arango_document)
        arango.db.collection("relations").insert(
            arango_document, silent=True, overwrite=True
        )
        print("\n")
||||
|
||||
# result = client.chat.completions.create( |
||||
# messages=[ |
||||
# { |
||||
# "role": "user", |
||||
# "content": prompt, |
||||
# } |
||||
# ], |
||||
# model="gpt-4", |
||||
# ) |
||||
# print(result) |
||||
# answer = result.choices[0].message.content |
||||
# print(answer) |
||||
|
||||
# print('\n\n') |
||||
# print(f"\033[92m{answer}\033[0m") |
||||
# exit() |
||||
@ -0,0 +1,148 @@ |
||||
from _llm import LLM |
||||
import fitz |
||||
from _arango import arango |
||||
from openai import OpenAI |
||||
from pprint import pprint |
||||
|
||||
class Section:
    """
    One section of the PDF (an interrogation or a PM) with the metadata the
    LLM extracts from its first page.

    The methods use the module-level ``llm`` and ``arango``.
    """

    def __init__(self, type, page, filename="Förhörsprotokoll.pdf"):
        """
        Args:
            type (str): Name of the collection the section is stored in.
            page (int): Number of the page on which the section starts.
            filename (str, optional): Name of the PDF the section comes from.
        """
        self.type = type
        self.filename = filename
        self.text = ""
        self.person = ""
        self.role = ""
        self.topic = ""
        self.date = ""
        self.start_page = page
        self.key = ""

    def _make_key(self):
        """Return the document key built from person, date and start page."""
        return arango.fix_key_name(f"{self.person}_{self.date}_p.{self.start_page}")

    def add_to_arango(self):
        """Insert the section into the collection named by ``self.type``,
        overwriting any document with the same key."""
        key = self._make_key()
        arango_doc = {
            "_key": key,
            "person": self.person,
            "role": self.role,
            "topic": self.topic,
            "date": self.date,
            "page": self.start_page,
            "text": self.text,
            "filename": self.filename,
        }
        arango.db.collection(self.type).insert(arango_doc, overwrite=True)
        print(f"Added {self.type} to ArangoDB with key {key}")

    def extract_interrogation(self, text):
        """Fill person, role, topic, date and key from the first page of an
        interrogation, using one LLM prompt per field."""
        self.person = llm.generate(
            f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är förhörd? Svara på formen "Förnamn Efternamn" \n\nFörhörd person:'
        )
        self.role = llm.generate(
            f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är {self.person}? \n\nTitel på förhörd person:'
        )
        self.topic = llm.generate(
            f'Kolla på texten nedan: \n\n """{text}""" \n\n Vad handlade förhöret om? Svara så kortfattat som möjligt. \n\nFörhörets syfte:'
        )
        # Four-digit year in the requested format (was "YYY").
        self.date = llm.generate(
            f'Kolla på texten nedan: \n\n """{text}""" \n\n När ägde förhöret rum? Svara på formen YYYY-MM-DD \n\nFörhörsdatum:'
        )

        self.key = self._make_key()

    def extract_pm(self, text):
        """Fill person, role, topic, date and key from the first page of a
        PM, using one LLM prompt per field."""
        self.person = llm.generate(
            f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är uppgiftslämnare? Svara på formen "Förnamn Efternamn" \n\nPM:'
        )
        self.role = llm.generate(
            f'Kolla på texten nedan: \n\n """{text}""" \n\n Vem är {self.person}? Svara "None" om det inte framgår. \n\nTitel på person:'
        )
        # The prompt used to end in "\n\Svar:" (an invalid escape that put a
        # literal backslash into the prompt); it now ends with two newlines.
        self.topic = llm.generate(
            f'Kolla på texten nedan: \n\n """{text}""" \n\n Vad handlade informationen om? Svara så kortfattat som möjligt. Svara "None" om det inte framgår. \n\nSvar:'
        )
        self.date = llm.generate(
            f'Kolla på texten nedan: \n\n """{text}""" \n\n När lämnades informationen? Svara på formen YYYY-MM-DD \n\nDatum:'
        )
        self.key = self._make_key()
||||
|
||||
|
||||
def new_interrogation(page, section):
    """
    Close the current section and begin a new interrogation section.

    If *section* has collected any text it is written to the database first.
    A new "interrogations" Section starting at *page* is then created and its
    metadata is extracted from the page text.

    Returns:
        Section: The newly started section.
    """
    has_collected_text = section.text != ""
    if has_collected_text:
        section.add_to_arango()

    started = Section("interrogations", page.number)
    started.extract_interrogation(page.get_text())
    return started
||||
|
||||
|
||||
def new_pm(page, section):
    """
    Close the current section and begin a new PM section.

    If *section* has collected any text it is written to the database first.
    A new "pms" Section starting at *page* is then created and its metadata
    is extracted with the PM prompts.

    Returns:
        Section: The newly started section.
    """
    if section.text != "":
        section.add_to_arango()
    section = Section("pms", page.number)
    # A PM needs the PM prompts; the interrogation prompts were called here
    # by mistake.
    section.extract_pm(page.get_text())
    return section
||||
|
||||
|
||||
|
||||
# * Llama
llm = LLM(chat=False, model="llama3:8b-instruct-q5_K_M")

# Open the PDF file
filename = "Förhörsprotokoll.pdf"
area = fitz.Rect(0, 40, 520, 800)  # To exclude the header

doc = fitz.open(f"pdfs/{filename}")

section = Section("interrogations", 0)

# A page containing at least two phrases from a list is treated as the first
# page of a new section of that kind.
control_words_interrogation = [
    "Förhörsdatum",
    "Förhör påbörjat",
    "Förhör avslutat",
    "Förhörssätt",
    "Typ av förhör",
    "Förhörsvittne",
]
control_words_pm = [
    "PM",
    "Uppgiften avser",
    "Upprättad av",
    "Sätt på vilket uppgift lämnats",
    "Uppgiftslämnare",
]

# NOTE(review): the stop value is exclusive in PyMuPDF's Document.pages(), so
# the last page of the document is not visited -- confirm this is intended.
for page in doc.pages(9, len(doc) - 1):

    # Get the text from the page
    page_text = page.get_text("text")

    n_control_words_interrogation = sum(
        word in page_text for word in control_words_interrogation
    )
    n_control_words_pm = sum(word in page_text for word in control_words_pm)

    if n_control_words_interrogation >= 2:
        # First page of a new interrogation: only the lower part of the page
        # is added to the section text.
        section = new_interrogation(page, section)
        area = fitz.Rect(0, 400, 520, 800)
    elif n_control_words_pm >= 2:
        # First page of a new PM.
        area = fitz.Rect(0, 400, 520, 800)
        section = new_pm(page, section)
    else:
        # It's a "normal" page
        area = fitz.Rect(0, 40, 520, 800)  # To exclude the header

    blocks = page.get_text("blocks", clip=area)
    for block in blocks:
        # Index 4 of a block tuple is used as the block's text.
        section.text += block[4] + "\n\n"

# Sections are otherwise only saved when the next one starts, so the final
# section has to be written here.
if section.text != "":
    section.add_to_arango()
||||
@ -0,0 +1,24 @@ |
||||
import fitz |
||||
from pprint import pprint |
||||
from _arango import arango |
||||
from _llm import LLM |
||||
|
||||
# For every stored interrogation, ask the LLM why the person was interrogated
# and save the answer in the document's "reason" field.
llm = LLM(chat=False)
docs = list(arango.db.collection('interrogations').all())
sorted_docs = sorted(docs, key=lambda x: x['date'])

filename = "Förhörsprotokoll.pdf"
pdf = fitz.open(f"pdfs/{filename}")

for doc in sorted_docs:

    # Only the page on which the interrogation starts is shown to the model.
    pdf_page = pdf[doc['page']]
    text = pdf_page.get_text()

    print(doc['person'])
    # The prompt used to end in "\n\:" (an invalid escape that put a literal
    # backslash into the prompt); it now ends like the other prompts.
    prompt = f'Kolla på texten nedan: \n\n """{text}""" \n\n Den förhörda personen heter {doc["person"]} Varför förhörs {doc["person"]}? Om det har något att göra med {doc["person"]}s titel eller yrke, svara med det, annars med eventuell annan anledning. Om det inte finns någon speciell anledning eller titel, svara "None". \n\nSvar:'
    answer = llm.generate(prompt)
    doc['reason'] = answer
    print("\033[92m" + answer + "\033[0m")
    print()
    arango.db.collection('interrogations').update(doc)
||||
@ -0,0 +1,119 @@ |
||||
from _arango import arango |
||||
from _llm import LLM |
||||
|
||||
from pprint import pprint |
||||
from langchain_text_splitters import CharacterTextSplitter |
||||
|
||||
# Split interrogation texts at blank lines into chunks of up to 8000
# characters (length measured with len).
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=8000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

llm = LLM(chat=False)

interrogations = list(arango.db.collection("interrogations").all())
interrogations = sorted(interrogations, key=lambda x: x["date"])

for interrogation in interrogations:

    # Get the persons (now updated from the last one, so it should be all persons in the database)
    persons_collection = arango.db.collection("persons")
    persons_docs = list(persons_collection.all())
    persons = [i["name"] for i in persons_docs]
    persons_dict = {i["name"]: i for i in persons_docs}
    persons_string = "\n".join(persons)

    interrogated_person = interrogation["person"]
    if interrogated_person not in persons_dict:
        # Without a source vertex no edge can be created. Skipping avoids a
        # NameError on the first iteration and edges that start at the
        # previous interrogation's person on later ones.
        print("\033[91m" + f"{interrogated_person} not in persons, skipping" + "\033[0m")
        continue
    _from = persons_dict[interrogated_person]["_id"]

    text = interrogation["text"]
    chunks = text_splitter.split_text(text)
    for chunk in chunks:
        # The prompt used to start with a stray "'" caused by a fourth
        # opening quote.
        prompt = f'''
Kolla på texten nedan: \n\
TEXT:
"""{chunk}""" \n
{interrogation["person"]} förhörs. Nämns några personer i listan nedan i själva förhöret? \n
LISTA PÅ PERSONER:
{persons_string}\n
I texten kan en person nämnas med sitt fulla namn, men oftast bara förnamn eller efternamn.
Svara med fullständiga namn från listan och hur personen nämns i texten på formen "namn;hur personen nämns\n".
Nedan är ett exempel för att du ska förstå hur du ska svara:

<EXEMPEL>
John Lundqvist;John
Karl Renström; Karl
</EXEMPEL>

Svara ENBART med personens namn och hur det nämns, på formen "namn;hur personen nämns\n". Svara inte med något resonemang, och enbart med personer som nämns. Om ingen person från listan nämns, svara med None.
\nPersoner:'''

        relations = llm.generate(prompt)

        for relation in relations.replace('*', '').split("\n"):
            relation = relation.strip()
            if not relation or relation == "None":
                continue
            try:
                name, mention = relation.split(";", 1)
            except ValueError:
                # Not on the form "name;mention": show it and move on.
                print("\033[91m" + relation + "\033[0m")
                continue

            # The example in the prompt itself has a space after ";", so
            # surrounding whitespace is removed before any lookup.
            name = name.strip()
            mention = mention.strip()
            if not name:
                continue

            if name in persons_dict:
                _to = persons_dict[name]["_id"]
            else:
                # Try "Lastname Firstname" given as "Firstname Lastname".
                name_parts = name.split(" ")
                swapped = f'{name_parts[1]} {name_parts[0]}' if ' ' in name else None
                if swapped is not None and swapped in persons_dict:
                    _to = persons_dict[swapped]["_id"]
                else:
                    # Unknown person: create it unless a document with the
                    # same key already exists.
                    person_key = arango.fix_key_name(name)
                    if person_key not in persons_collection:
                        persons_collection.insert(
                            {
                                "name": name,
                                "_key": person_key,
                                "interrogated": "Unknown",
                            }
                        )
                    _to = persons_collection.get(person_key)["_id"]

            if _from == _to:
                continue

            relation_key = arango.fix_key_name(f"{_from}_{_to}__{interrogation['_key']}").replace("persons_", "")
            if arango.db.has_document("all_relations/" + relation_key):
                continue

            # Ask LLM about the context of the mention
            prompt = f'Nedan är en del av ett förhör med {interrogated_person}.\n\n"""{chunk}"""\n\n{interrogated_person} nämner i förhöret en person vid namn {name}. Exakt vad säger {interrogated_person} om {name}? Svara så kortfattat som möjligt.\n\nSvar:'
            mention_context = llm.generate(prompt)
            arango_doc = {
                "_key": relation_key,
                "_from": _from,
                "_to": _to,
                "in": interrogation["_id"],
                "context": "interrogation",
                "mentioned_as": mention,
                "mention": mention_context,
            }

            print("\033[92m" + f'{_from} -> {_to}' + "\033[0m")
            print(mention_context)
            print()
            arango.db.collection("all_relations").insert(
                arango_doc,
                overwrite=True,
            )
||||
@ -0,0 +1,110 @@ |
||||
import difflib |
||||
from _arango import arango |
||||
# filename = "Huvudprotokoll.pdf" |
||||
|
||||
# doc = fitz.open(f"pdfs/{filename}") |
||||
|
||||
# def group_words_by_y(words, tolerance=2): |
||||
# # Sort the words by their y-coordinate |
||||
# words.sort(key=lambda word: word[1]) |
||||
|
||||
# # Group the words by their rounded y-coordinate |
||||
# grouped_words = itertools.groupby(words, key=lambda word: round(word[1] / tolerance)) |
||||
|
||||
# # Sort the words in each group by their x-coordinate and combine the text |
||||
# combined_words = [' '.join(word[4] for word in sorted(group, key=lambda word: word[0])) for _, group in grouped_words] |
||||
|
||||
# return combined_words |
||||
|
||||
# append = False |
||||
# for page in doc.pages(1,3): |
||||
# words = [] |
||||
# text_words = page.get_text('words', sort=True) |
||||
# for word in text_words: |
||||
# if append: |
||||
# words.append(word) |
||||
# if word[4] == "Brottsplatsadress": |
||||
# append = True |
||||
# combined_words = group_words_by_y(words, tolerance=5) |
||||
# for word in combined_words: |
||||
# last_space_index = word.rfind(' ') |
||||
# if last_space_index != -1: |
||||
# first_part = word[:last_space_index] |
||||
# if ',' in first_part: |
||||
# word_parts = first_part.split(',') |
||||
# first_part = word_parts[1].strip() + ' ' + word_parts[0].strip() |
||||
# second_part = word[last_space_index+1:] |
||||
# else: |
||||
# first_part = word |
||||
# second_part = '' |
||||
# print(first_part.strip(), ';', second_part.strip()) |
||||
|
||||
# Take the output and clean it up in Excel |
||||
|
||||
|
||||
# Persons with their role, one dict per person with the keys "name" (full
# name) and "role" ("Misstänkt", "Vittne" or "Målsägande").
# NOTE(review): some names look like text-recognition errors, e.g.
# "lngemund" (lower-case L first), "Thorbjöm" and "BemdtPatrik" -- verify
# against the source document before relying on exact matches.
data = [
    {"name": "Carl-William Ahlqvist", "role": "Misstänkt"},
    {"name": "Elias David Ahlqvist", "role": "Vittne"},
    {"name": "Marlene Linnea Ahlqvist", "role": "Misstänkt"},
    {"name": "Jhonny Kaj lngemund Backman", "role": "Vittne"},
    {"name": "Louise Solveig Karin Bengtsson", "role": "Vittne"},
    {"name": "Ove Robert Greger Bengtsson", "role": "Misstänkt"},
    {"name": "Björn Willy Johnny Borell", "role": "Vittne"},
    {"name": "Lars Victor Bystedt", "role": "Vittne"},
    {"name": "Svea Helena Caroline Enberg", "role": "Vittne"},
    {"name": "Agnes Marie Hällgren", "role": "Vittne"},
    {"name": "Anna Jessica Maria Höglund", "role": "Vittne"},
    {"name": "Kent Åke Höglund", "role": "Vittne"},
    {"name": "Dan Anton Tobias Johansson", "role": "Vittne"},
    {"name": "Fredrik Max Johansson", "role": "Vittne"},
    {"name": "Ivar Emanuel Johansson", "role": "Målsägande"},
    {"name": "Rut Marit Beatrice Johansson", "role": "Målsägande"},
    {"name": "Lars Anders Markus Karlsson", "role": "Vittne"},
    {"name": "Eija Inkeri Kjäll", "role": "Vittne"},
    {"name": "Neo Arvid Magnus Larsson", "role": "Vittne"},
    {"name": "Lena Marie Susann Lind", "role": "Vittne"},
    {"name": "Elin Linnea Maria Lindell", "role": "Vittne"},
    {"name": "Sofi Teresia Lindwall", "role": "Vittne"},
    {"name": "Lars Thorbjöm Lundgren", "role": "Vittne"},
    {"name": "Fredrik Lars Lundmark", "role": "Vittne"},
    {"name": "Lars-Erik Mikael Molin", "role": "Vittne"},
    {"name": "Per Lars-Erik Molin", "role": "Vittne"},
    {"name": "Robin Alex Nieminen", "role": "Misstänkt"},
    {"name": "Malin Charlotta Nyström", "role": "Vittne"},
    {"name": "Ola Folke Magnus Pålsson", "role": "Vittne"},
    {"name": "Anna Margareta Renlund", "role": "Vittne"},
    {"name": "Karl Emanuel Renström", "role": "Vittne"},
    {"name": "Karl Henrik Sjölund", "role": "Vittne"},
    {"name": "Sven Bertil Stenberg", "role": "Vittne"},
    {"name": "BemdtPatrik Svahn", "role": "Vittne"},
    {"name": "Nea Christina Vänstedt", "role": "Vittne"},
    {"name": "Ola Nils Vänstedt", "role": "Vittne"},
    {"name": "Ulf Peder Öhman", "role": "Vittne"}
]
||||
|
||||
# Look-up tables built from the person list: name -> role, and the bare names.
persons = {}
list_of_names = []
for entry in data:
    persons[entry['name']] = entry['role']
    list_of_names.append(entry['name'])

interrogations = arango.db.collection('interrogations').all()

for doc in interrogations:
    # Take the first of the (at most two) closest names that contains every
    # word of the stored person name.
    candidates = difflib.get_close_matches(doc['person'], list_of_names, n=2)
    most_similar_name = next(
        (
            candidate
            for candidate in candidates
            if set(doc['person'].split()).issubset(set(candidate.split()))
        ),
        None,
    )

    if most_similar_name:
        # Match: green output, role and full name taken from the list.
        doc['role'] = persons[most_similar_name]
        print("\033[92m" + doc['person'] + "\033[0m")
        doc['full_name'] = most_similar_name
    else:
        # No match: red output and no role.
        doc['role'] = None
        print("\033[91m" + doc['person'] + "\033[0m")

    arango.db.collection('interrogations').update(doc, keep_none=False)
||||
@ -0,0 +1,23 @@ |
||||
from _arango import arango |
||||
from _llm import LLM |
||||
|
||||
# Build one document per interrogated person in the `persons` collection.
#
# Several interrogations can resolve to the same person key. Upserting each
# interrogation on its own with overwrite_mode="update" replaces array
# attributes instead of extending them (ArangoDB merges objects, not arrays),
# which would leave only the last interrogation on the person. The rows are
# therefore aggregated per key first and each person is written once.
interrogations = list(arango.db.collection("interrogations").all())

persons_by_key = {}
for person in interrogations:
    key = arango.fix_key_name(person["person"])
    arango_doc = persons_by_key.setdefault(
        key,
        {
            "_key": key,
            "name": person["person"],
            "role": None,
            "reason_for_interrogation": [],
            "full_name": None,
            "interrogation_date": [],
            "interrogations": [],
        },
    )

    # Scalar attributes: keep the most recent value, but never let a missing
    # or null value wipe out one that was already found.
    arango_doc["name"] = person["person"]
    for field in ("role", "full_name"):
        value = person.get(field)
        if value is not None:
            arango_doc[field] = value

    # The three lists stay index-aligned: entry i of each list describes the
    # same interrogation, so nothing is de-duplicated here.
    arango_doc["reason_for_interrogation"].append(person["topic"])
    arango_doc["interrogation_date"].append(person["date"])
    arango_doc["interrogations"].append(person["_key"])

persons_collection = arango.db.collection("persons")
for arango_doc in persons_by_key.values():
    # keep_none=False drops attributes that are still None (no role / full name).
    persons_collection.insert(
        arango_doc, overwrite_mode="update", merge=True, keep_none=False
    )
||||
@ -0,0 +1,11 @@ |
||||
from _arango import arango |
||||
from _llm import LLM |
||||
|
||||
# ANSI escape sequences used to print the person name in green.
GREEN = "\033[92m"
RESET = "\033[0m"

# The LLM client is constructed here but not used further in this script.
llm = LLM(keep_alive=6000, chat=False)

# Fetch every interrogation whose `reason` attribute is not null.
q = 'for doc in interrogations filter doc.reason != null return doc'
docs = list(arango.db.aql.execute(q))

for doc in docs:
    print(GREEN, doc['person'], RESET, doc['reason'])
||||
|
||||
@ -0,0 +1,67 @@ |
||||
import queue |
||||
import threading |
||||
from _arango import arango |
||||
from _llm import LLM |
||||
|
||||
|
||||
# Initialize and start the LLM object
llm = LLM(start=True)

# Add requests to the LLM object
# Only interrogations whose `summary` is null are selected.
q = 'for doc in interrogations filter doc.summary == null return doc'
docs = [i for i in arango.db.aql.execute(q)]
print(len(docs))
# NOTE(review): the script ends here. After printing the number of pending
# documents the LLM is stopped and exit() terminates the process, so none of
# the code below this point is ever executed. This looks like a leftover
# dry-run switch -- confirm the intent before removing it.
llm.stop()
exit()
for doc in docs:
    text = doc['text']
    # Swedish prompt asking for a short summary of the interrogation text.
    prompt = f'Kolla på texten nedan: \n\n """{text}""" \n\n Sammanfatta förhöret med fokus på vad som sades, inte var det hölls eller annat formalia. Svara så kort som möjligt men var noga med detaljer som händelser som beskrivs, namn, datum och platser.\nKort sammanfattning:'
    # Requests are keyed by the document's _key.
    llm.add_request(doc['_key'], prompt)

# Signal that all requests have been added
llm.finish_adding_requests()

# Get the results
llm.get_results()
||||
|
||||
|
||||
|
||||
# Initialize the LLM object
llm = LLM(chat=False, model="llama3:8b-instruct-q5_K_M")

# One queue feeds prompts to the worker, the other carries summaries back.
request_queue = queue.Queue()
result_queue = queue.Queue()

# Events used to signal "no more requests" and "all results delivered".
all_requests_added_event = threading.Event()
all_results_processed_event = threading.Event()

# Run generate_concurrent in a background thread.
worker = threading.Thread(
    target=llm.generate_concurrent,
    args=(
        request_queue,
        result_queue,
        all_requests_added_event,
        all_results_processed_event,
    ),
)
worker.start()

# Queue one summarisation prompt per interrogation, keyed by the document _key.
interrogations = arango.db.collection('interrogations').all()
for doc in interrogations:
    text = doc['text']
    prompt = f'Kolla på texten nedan: \n\n """{text}""" \n\n Sammanfatta förhöret med fokus på vad som sades, inte var det hölls eller annat formalia. Svara så kort som möjligt men var noga med detaljer som händelser som beskrivs, namn, datum och platser.\nKort sammanfattning:'
    request_queue.put((doc['_key'], prompt))

# Signal that all requests have been added
all_requests_added_event.set()

# Drain the result queue, storing each summary on its interrogation document.
while True:
    try:
        doc_id, summary = result_queue.get(timeout=1)
    except queue.Empty:
        # Stop only once the queue is empty and the results-processed event is set.
        if all_results_processed_event.is_set():
            break
        continue

    print("\033[92m" + doc_id + "\033[0m", summary)
    arango.db.collection('interrogations').update_match({'_key': doc_id}, {'summary': summary})
||||
Loading…
Reference in new issue