Add models, testing scripts, and result viewing functionality

- Implemented Pydantic models for article processing and summarization.
- Created `test_and_view.py` for testing LLM server document summarization.
- Developed `test_llm_server.py` for unit testing summarization functionality.
- Added `test_server.py` for additional testing of document and chunk summarization.
- Introduced `view_latest_results.py` to display the latest summaries from the LLM server.
- Established a structured plan for handling document chunks and their metadata.
- Enhanced error handling and user feedback in testing scripts.
main
lasseedfast 9 months ago
parent 5ee1a062f1
commit 62b68c3717
  1. _arango.py (967)
  2. _base_class.py (257)
  3. _bots.py (800)
  4. _bots_dont_use.py (497)
  5. _chromadb.py (299)
  6. _llm.py (574)
  7. _llmOLD.py (581)
  8. agent_research.py (1448)
  9. article2db.py (335)
  10. bot_tools.py (0)
  11. info.py (1)
  12. llm_queries.py (5)
  13. llm_server.py (373)
  14. manage_users.py (19)
  15. models.py (334)
  16. ollama_response_classes.py (6)
  17. projects_page.py (239)
  18. research_page.py (249)
  19. streamlit_app.py (36)
  20. streamlit_chatbot.py (737)
  21. test.py (37)
  22. test_ tortoise.py (31)
  23. test_and_view.py (209)
  24. test_fairseq.py (51)
  25. test_highlight.py (91)
  26. test_llm_server.py (191)
  27. test_ollama_client.py (38)
  28. test_ollama_image.py (9)
  29. test_research.py (206)
  30. test_server.py (123)
  31. test_tts.py (45)
  32. test_tts_call_server.py (22)
  33. tts_save_speaker.py (33)
  34. utils.py (94)
  35. view_latest_results.py (111)

_arango.py
@@ -1,75 +1,950 @@
import os
import re
from datetime import datetime

from arango import ArangoClient
from arango.collection import StandardCollection as ArangoCollection
from dotenv import load_dotenv  # Install with: pip install python-dotenv

from models import UnifiedDataChunk, UnifiedSearchResults
from utils import fix_key

if "INFO" not in os.environ:
    import env_manager
    env_manager.set_env()

load_dotenv()

COLLECTIONS_IN_BASE = [
    "sci_articles",
]
class ArangoDB:
    """
    ArangoDB Client Wrapper

    This class provides a wrapper around the ArangoClient to simplify working with ArangoDB databases
    and collections in a scientific document management context. It handles authentication, database
    connections, and provides high-level methods for common operations.

    Key features:
    - Database and collection management
    - Document CRUD operations (Create, Read, Update, Delete)
    - AQL query execution
    - Scientific article storage and retrieval
    - Project and note management
    - Chat history storage
    - Settings management

    Usage example:
        arango = ArangoDB(user="admin", password="password")
        # Create a collection
        arango.create_collection("my_collection")
        # Insert a document
        doc = arango.insert_document("my_collection", {"name": "Test Document"})
        # Query documents
        results = arango.execute_aql("FOR doc IN my_collection RETURN doc")

    Environment variables:
        ARANGO_HOST: The ArangoDB host URL
        ARANGO_PASSWORD: The default password for authentication
    """

    def __init__(self, user="admin", password=None, db_name="base"):
        """
        Initialize a connection to an ArangoDB database.

        This constructor establishes a connection to an ArangoDB instance using the provided
        credentials and database name. It uses environment variables for host and password
        if not explicitly provided.

        Parameters
        ----------
        user : str, optional
            Username for database authentication. Defaults to "admin".
            If db_name is not "base", then user will be set to db_name.
        password : str, optional
            Password for database authentication. If not provided,
            the password will be retrieved from the ARANGO_PASSWORD environment variable.
        db_name : str, optional
            Name of the database to connect to. Defaults to "base".
            If not "base", this value will also be used as the username.

        Notes
        -----
        - The host URL is always retrieved from the ARANGO_HOST environment variable.
        - For the "base" database, the username will be either "admin" or the provided user.
        - For other databases, the username will be the same as the database name.

        Attributes
        ----------
        user : str
            The username used for authentication.
        password : str
            The password used for authentication.
        db_name : str
            The name of the connected database.
        client : ArangoClient
            The ArangoDB client instance.
        db : Database
            The database instance for executing operations.
        """
        host = os.getenv("ARANGO_HOST")
        self.password = password or os.getenv("ARANGO_PASSWORD")
        # For any database other than "base", the username matches the database name
        if db_name != "base":
            self.user = db_name
            self.db_name = db_name
        elif user == "admin":
            self.user = "admin"
            self.db_name = "base"
        else:
            self.user = user
            self.db_name = user
        if self.user == "lasse":  #! This needs to be fixed to work with all users!
            self.password = os.getenv("ARANGO_PWD_LASSE")
        self.client = ArangoClient(hosts=host)
        self.db = self.client.db(
            self.db_name, username=self.user, password=self.password
        )
    def fix_key(self, _key):
        return fix_key(_key)

    # Collection operations
    def get_collection(self, collection_name: str) -> ArangoCollection:
        """
        Get a collection by name.

        Args:
            collection_name (str): The name of the collection.

        Returns:
            ArangoCollection: The collection object.
        """
        return self.db.collection(collection_name)
def has_collection(self, collection_name: str) -> bool:
"""
Check if a collection exists.
Args:
collection_name (str): The name of the collection.
Returns:
bool: True if the collection exists, False otherwise.
"""
return self.db.has_collection(collection_name)
def create_collection(self, collection_name: str) -> ArangoCollection:
"""
Create a new collection.
Args:
collection_name (str): The name of the collection to create.
Returns:
ArangoCollection: The created collection.
"""
return self.db.create_collection(collection_name)
def delete_collection(self, collection_name: str) -> bool:
"""
Delete a collection.
Args:
collection_name (str): The name of the collection to delete.
Returns:
bool: True if the collection was deleted successfully.
"""
if self.has_collection(collection_name):
return self.db.delete_collection(collection_name)
return False
def truncate_collection(self, collection_name: str) -> bool:
"""
Truncate a collection (remove all documents).
Args:
collection_name (str): The name of the collection to truncate.
Returns:
bool: True if the collection was truncated successfully.
"""
if self.has_collection(collection_name):
return self.db.collection(collection_name).truncate()
return False
# Document operations
def get_document(self, document_id: str):
"""
Get a document by ID.
Args:
document_id (str): The ID of the document to get.
Returns:
dict: The document if found, None otherwise.
"""
        try:
            return self.db.document(document_id)
        except Exception:
            return None
def has_document(self, collection_name: str, document_key: str) -> bool:
"""
Check if a document exists in a collection.
Args:
collection_name (str): The name of the collection.
document_key (str): The key of the document.
Returns:
bool: True if the document exists, False otherwise.
"""
return self.db.collection(collection_name).has(document_key)
def insert_document(
self,
collection_name: str,
document: dict,
overwrite: bool = False,
overwrite_mode: str = "update",
keep_none: bool = False,
):
"""
Insert a document into a collection.
Args:
collection_name (str): The name of the collection.
document (dict): The document to insert.
overwrite (bool, optional): Whether to overwrite an existing document. Defaults to False.
            overwrite_mode (str, optional): The mode for overwriting ('replace' or 'update'). Defaults to "update".
keep_none (bool, optional): Whether to keep None values. Defaults to False.
Returns:
dict: The inserted document with its metadata (_id, _key, etc.)
"""
assert '_id' in document or '_key' in document, "Document must have either _id or _key"
if '_id' not in document:
document['_id'] = f"{collection_name}/{document['_key']}"
return self.db.collection(collection_name).insert(
document,
overwrite=overwrite,
overwrite_mode=overwrite_mode,
keep_none=keep_none,
)
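    # Usage sketch (hypothetical key and fields; assumes the collection exists):
    #
    #     arango = ArangoDB(db_name="base")
    #     doc = arango.insert_document(
    #         "sci_articles",
    #         {"_key": "example_key", "title": "Example"},
    #         overwrite=True,           # allow updating if the key already exists
    #         overwrite_mode="update",  # merge fields instead of replacing the document
    #     )
    #     print(doc["_id"])  # -> "sci_articles/example_key"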
def update_document(
self, document: dict, check_rev: bool = False, silent: bool = False
):
"""
Update a document that already has _id or _key.
Args:
document (dict): The document to update.
check_rev (bool, optional): Whether to check document revision. Defaults to False.
silent (bool, optional): Whether to return the updated document. Defaults to False.
Returns:
dict: The updated document if silent is False.
"""
return self.db.update_document(document, check_rev=check_rev, silent=silent)
def update_document_by_match(
self, collection_name: str, filters: dict, body: dict, merge: bool = True
):
"""
Update documents that match a filter.
Args:
collection_name (str): The name of the collection.
filters (dict): The filter to match documents.
body (dict): The update to apply.
merge (bool, optional): Whether to merge the update with existing data. Defaults to True.
Returns:
dict: The result of the update operation.
"""
return self.db.collection(collection_name).update_match(
filters=filters, body=body, merge=merge
)
def delete_document(self, collection_name: str, document_key: str):
"""
Delete a document from a collection.
Args:
collection_name (str): The name of the collection.
document_key (str): The key of the document to delete.
Returns:
dict: The deletion result.
"""
return self.db.collection(collection_name).delete(document_key)
def delete_document_by_match(self, collection_name: str, filters: dict):
"""
Delete documents that match a filter.
Args:
collection_name (str): The name of the collection.
filters (dict): The filter to match documents.
Returns:
dict: The deletion result.
"""
return self.db.collection(collection_name).delete_match(filters=filters)
# Query operations
def execute_aql(self, query: str, bind_vars: dict = None):
"""
Execute an AQL query.
Args:
query (str): The AQL query to execute.
bind_vars (dict, optional): Bind variables for the query. Defaults to None.
Returns:
Cursor: A cursor to the query results.
"""
return self.db.aql.execute(query, bind_vars=bind_vars)
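    # Usage sketch (hypothetical query; bind variables keep values out of the
    # query string and avoid AQL injection):
    #
    #     cursor = arango.execute_aql(
    #         "FOR doc IN sci_articles FILTER doc.crossref == @flag LIMIT 5 RETURN doc._key",
    #         bind_vars={"flag": True},
    #     )
    #     keys = list(cursor)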
def get_all_documents(self, collection_name: str):
"""
Get all documents from a collection.
Args:
collection_name (str): The name of the collection.
Returns:
list: All documents in the collection.
"""
return list(self.db.collection(collection_name).all())
# Database operations
def has_database(self, db_name: str) -> bool:
"""
Check if a database exists.
Args:
db_name (str): The name of the database.
Returns:
bool: True if the database exists, False otherwise.
"""
return self.client.has_database(db_name)
def create_database(self, db_name: str, users: list = None) -> bool:
"""
Create a new database.
Args:
db_name (str): The name of the database to create.
users (list, optional): List of user objects with access to the database. Defaults to None.
Returns:
bool: True if the database was created successfully.
"""
return self.client.create_database(db_name, users=users)
def delete_database(self, db_name: str) -> bool:
"""
Delete a database.
Args:
db_name (str): The name of the database to delete.
Returns:
bool: True if the database was deleted successfully.
"""
if self.client.has_database(db_name):
return self.client.delete_database(db_name)
return False
# Domain-specific operations
# Scientific Articles
def get_article(
self,
article_key: str,
db_name: str = None,
collection_name: str = "sci_articles",
):
"""
Get a scientific article by key.
Args:
article_key (str): The key of the article.
            db_name (str, optional): The database name to search in. Defaults to current database.
            collection_name (str, optional): The collection to read from. Defaults to "sci_articles".
Returns:
dict: The article document if found, None otherwise.
"""
        try:
            return self.db.collection(collection_name).get(article_key)
        except Exception as e:
            print(f"Error retrieving article {article_key}: {e}")
            return None
def get_article_by_doi(self, doi: str):
"""
Get a scientific article by DOI.
Args:
doi (str): The DOI of the article.
Returns:
dict: The article document if found, None otherwise.
"""
query = """
FOR doc IN sci_articles
FILTER doc.metadata.doi == @doi
RETURN doc
"""
cursor = self.db.aql.execute(query, bind_vars={"doi": doi})
try:
return next(cursor)
except StopIteration:
return None
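    # Usage sketch (hypothetical DOI; returns None when nothing matches):
    #
    #     article = arango.get_article_by_doi("10.1000/example.doi")
    #     if article:
    #         print(article["metadata"].get("title"))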
def get_document_text(
self, _id: str = None, _key: str = None, collection: str = None
):
"""
Get the text content of a document. If _key is used, collection must be provided.
* Use base_arango for sci_articles and user_arango for other collections. *
Args:
_id (str, optional): The ID of the document. Defaults to None.
_key (str, optional): The key of the document. Defaults to None.
collection (str, optional): The name of the collection. Defaults to None.
Returns:
str: The text content of the document, or None if not found.
"""
if collection == "sci_articles" or _id.startswith("sci_articles"):
assert (
self.db_name == "base"
), "If requesting sci_articles base_arango must be used"
else:
assert (
self.db_name != "base"
), "If not requesting sci_articles user_arango must be used"
try:
if _id:
doc = self.db.document(_id)
elif _key:
assert (
collection is not None
), "Collection name must be provided if _key is used"
doc = self.db.collection(collection).get(_key)
text = [chunk.get("text") for chunk in doc.get("chunks", [])]
except Exception as e:
print(f"Error retrieving text for document {_id or _key}: {e}")
return None
return "\n".join(text) if text else None
def store_article_chunks(
self, article_data: dict, chunks: list, document_key: str = None
):
"""
Store article chunks in the database.
Args:
article_data (dict): The article metadata.
chunks (list): The chunks of text from the article.
document_key (str, optional): The key to use for the document. Defaults to None.
Returns:
tuple: (document_id, database_name, document_doi)
"""
        """
collection = "sci_articles"
arango_chunks = []
for index, chunk in enumerate(chunks):
chunk_id = f"{document_key}_{index}" if document_key else f"chunk_{index}"
page_numbers = chunk.get("pages", [])
text = chunk.get("text", "")
arango_chunks.append({"text": text, "pages": page_numbers, "id": chunk_id})
arango_document = {
"_key": document_key,
"chunks": arango_chunks,
"metadata": article_data.get("metadata", {}),
}
if article_data.get("summary"):
arango_document["summary"] = article_data.get("summary")
if article_data.get("doi"):
arango_document["crossref"] = True
doc = self.insert_document(
collection_name=collection,
document=arango_document,
overwrite=True,
overwrite_mode="update",
keep_none=False,
)
return doc["_id"], self.db_name, article_data.get("doi")
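    # Usage sketch (hypothetical data; each chunk carries its text and page numbers):
    #
    #     chunks = [
    #         {"text": "Introduction ...", "pages": [1]},
    #         {"text": "Methods ...", "pages": [2, 3]},
    #     ]
    #     article_data = {"metadata": {"title": "Example"}, "doi": "10.1000/example"}
    #     _id, db_name, doi = arango.store_article_chunks(
    #         article_data, chunks, document_key="example_key"
    #     )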
    def add_article_to_collection(self, article_id: str, collection_name: str):
        """
        Add an article to a user's article collection.

        Args:
            article_id (str): The ID of the article.
            collection_name (str): The name of the user's collection.

        Returns:
            bool: True if the article was added successfully.
        """
query = """
FOR collection IN article_collections
FILTER collection.name == @collection_name
UPDATE collection WITH {
articles: PUSH(collection.articles, @article_id)
} IN article_collections
RETURN NEW
"""
cursor = self.db.aql.execute(
query,
bind_vars={"collection_name": collection_name, "article_id": article_id},
)
try:
return next(cursor) is not None
except StopIteration:
return False
def remove_article_from_collection(self, article_id: str, collection_name: str):
"""
Remove an article from a user's article collection.
Args:
article_id (str): The ID of the article.
collection_name (str): The name of the user's collection.
Returns:
bool: True if the article was removed successfully.
"""
query = """
FOR collection IN article_collections
FILTER collection.name == @collection_name
UPDATE collection WITH {
articles: REMOVE_VALUE(collection.articles, @article_id)
} IN article_collections
RETURN NEW
"""
cursor = self.db.aql.execute(
query,
bind_vars={"collection_name": collection_name, "article_id": article_id},
)
try:
return next(cursor) is not None
except StopIteration:
return False
# Projects
def get_projects(self, username: str = None):
"""
Get all projects for a user.
Returns:
list: A list of project documents.
"""
if username:
query = """
FOR p IN projects
SORT p.name ASC
RETURN p
"""
return list(self.db.aql.execute(query))
else:
return self.get_all_documents("projects")
    def get_project(self, project_name: str, username: str = None):
        """
        Get a project by name.

        Args:
            project_name (str): The name of the project.
            username (str, optional): Unused; the lookup already runs against the
                current (per-user) database. Kept for API compatibility.

        Returns:
            dict: The project document if found, None otherwise.
        """
        query = """
        FOR p IN projects
        FILTER p.name == @project_name
        RETURN p
        """
        cursor = self.db.aql.execute(query, bind_vars={"project_name": project_name})
        try:
            return next(cursor)
        except StopIteration:
            return None
def create_project(self, project_data: dict):
"""
Create a new project.
Args:
project_data (dict): The project data.
Returns:
dict: The created project document.
"""
return self.insert_document("projects", project_data)
def update_project(self, project_data: dict):
"""
Update an existing project.
Args:
project_data (dict): The project data.
Returns:
dict: The updated project document.
"""
return self.update_document(project_data, check_rev=False)
def delete_project(self, project_name: str, username: str = None):
"""
Delete a project.
Args:
project_name (str): The name of the project.
username (str, optional): The username. Defaults to None.
Returns:
bool: True if the project was deleted successfully.
"""
filters = {"name": project_name}
if username:
filters["username"] = username
return self.delete_document_by_match("projects", filters)
def get_project_notes(self, project_name: str, username: str = None):
"""
Get notes for a project.
Args:
project_name (str): The name of the project.
username (str, optional): The username. Defaults to None.
Returns:
list: A list of note documents.
"""
query = """
FOR note IN notes
FILTER note.project == @project_name
"""
if username:
query += " AND note.username == @username"
query += """
SORT note.timestamp DESC
RETURN note
"""
bind_vars = {"project_name": project_name}
if username:
bind_vars["username"] = username
return list(self.db.aql.execute(query, bind_vars=bind_vars))
def add_note_to_project(self, note_data: dict):
"""
Add a note to a project.
Args:
note_data (dict): The note data.
Returns:
dict: The created note document.
"""
return self.insert_document("notes", note_data)
def fetch_notes_tool(
self, project_name: str, username: str = None
) -> UnifiedSearchResults:
"""
Fetch notes for a project and return them in a unified format.
Args:
project_name (str): The name of the project.
username (str, optional): The username. Defaults to None.
Returns:
UnifiedSearchResults: A unified representation of the notes.
"""
notes = self.get_project_notes(project_name, username)
chunks = []
source_ids = []
for note in notes:
chunk = UnifiedDataChunk(
content=note.get("content", ""),
metadata={
"title": note.get("title", "No title"),
"timestamp": note.get("timestamp", ""),
},
source_type="note",
)
chunks.append(chunk)
source_ids.append(note.get("_id", "unknown_id"))
return UnifiedSearchResults(chunks=chunks, source_ids=source_ids)
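    # Usage sketch (hypothetical project name; the unified format keeps chunk
    # text and source ids aligned by index):
    #
    #     results = arango.fetch_notes_tool("my_project")
    #     for chunk, source_id in zip(results.chunks, results.source_ids):
    #         print(source_id, chunk.content[:80])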
# Chat operations
def get_chat(self, chat_key: str):
"""
Get a chat by key.
Args:
chat_key (str): The key of the chat.
Returns:
dict: The chat document if found, None otherwise.
"""
        try:
            return self.db.collection("chats").get(chat_key)
        except Exception:
            return None
def create_or_update_chat(self, chat_data: dict):
"""
Create or update a chat.
Args:
chat_data (dict): The chat data.
Returns:
dict: The created or updated chat document.
"""
return self.insert_document("chats", chat_data, overwrite=True)
def get_chats_for_project(self, project_name: str, username: str = None):
"""
Get all chats for a project.
Args:
project_name (str): The name of the project.
username (str, optional): The username. Defaults to None.
Returns:
list: A list of chat documents.
"""
query = """
FOR chat IN chats
FILTER chat.project == @project_name
"""
if username:
query += " AND chat.username == @username"
query += """
SORT chat.timestamp DESC
RETURN chat
"""
bind_vars = {"project_name": project_name}
if username:
bind_vars["username"] = username
return list(self.db.aql.execute(query, bind_vars=bind_vars))
def delete_chat(self, chat_key: str):
"""
Delete a chat.
Args:
chat_key (str): The key of the chat.
Returns:
dict: The deletion result.
"""
return self.delete_document("chats", chat_key)
def delete_old_chats(self, days: int = 30):
"""
Delete chats older than a certain number of days.
Args:
days (int, optional): The number of days. Defaults to 30.
Returns:
int: The number of deleted chats.
"""
query = """
FOR chat IN chats
FILTER DATE_DIFF(chat.timestamp, DATE_NOW(), "d") > @days
REMOVE chat IN chats
RETURN OLD
"""
cursor = self.db.aql.execute(query, bind_vars={"days": days})
return len(list(cursor))
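    # Usage sketch: prune chats older than 90 days and report how many were removed.
    #
    #     removed = arango.delete_old_chats(days=90)
    #     print(f"Deleted {removed} old chats")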
# Settings operations
def get_settings(self):
"""
Get settings document.
Returns:
dict: The settings document if found, None otherwise.
"""
        try:
            return self.db.document("settings/settings")
        except Exception:
            return None
def initialize_settings(self, settings_data: dict):
"""
Initialize settings.
Args:
settings_data (dict): The settings data.
Returns:
dict: The created settings document.
"""
settings_data["_key"] = "settings"
return self.insert_document("settings", settings_data)
def update_settings(self, settings_data: dict):
"""
Update settings.
Args:
settings_data (dict): The settings data.
Returns:
dict: The updated settings document.
"""
return self.update_document_by_match(
collection_name="settings", filters={"_key": "settings"}, body=settings_data
)
def get_document_metadata(self, document_id: str) -> dict:
"""
Retrieve document metadata with merged user notes if available.
This method determines the appropriate database based on the document ID,
retrieves the document, and enriches its metadata with any user notes.
Args:
document_id (str): The document ID to retrieve metadata for
Returns:
dict: The document metadata dictionary, or empty dict if not found
"""
if not document_id:
return {}
try:
# Determine which database to use based on document ID prefix
if document_id.startswith("sci_articles"):
# Science articles are in the base database
db_to_use = self.client.db(
"base",
username=os.getenv("ARANGO_USER"),
password=os.getenv("ARANGO_PASSWORD"),
)
arango_doc = db_to_use.document(document_id)
else:
# User documents are in the user's database
arango_doc = self.db.document(document_id)
if not arango_doc:
return {}
# Get metadata and merge user notes if available
arango_metadata = arango_doc.get("metadata", {})
if "user_notes" in arango_doc:
arango_metadata["user_notes"] = arango_doc["user_notes"]
return arango_metadata
except Exception as e:
print(f"Error retrieving metadata for document {document_id}: {e}")
return {}
def summarise_chunks(self, document: dict, is_sci=False):
from _llm import LLM
from models import ArticleChunk
assert "_id" in document, "Document must have an _id field"
if is_sci:
system_message = """You are a science assistant summarizing scientific articles.
You will get an article chunk by chunk, and you have three tasks for each chunk:
1. Summarize the content of the chunk.
2. Tag the chunk with relevant tags.
3. Extract the scientific references from the chunk.
"""
else:
system_message = """You are a general assistant summarizing articles.
You will get an article chunk by chunk, and you have two tasks for each chunk:
1. Summarize the content of the chunk.
2. Tag the chunk with relevant tags.
"""
system_message += """\nPlease make use of the previous chunks you have already seen to understand the current chunk in context and make the summary stand for itself. But remember, *it is the current chunk you are summarizing*
ONLY use the information in the chunks to make the summary, and do not add any information that is not in the chunks."""
llm = LLM(system_message=system_message)
chunks = []
for chunk in document["chunks"]:
if "summary" in chunk:
chunks.append(chunk)
continue
prompt = f"""Summarize the following text to make it stand on its own:\n
'''
{chunk['text']}
'''\n
Your tasks are:
1. Summarize the content of the chunk. Make sure to include all relevant details!
2. Tag the chunk with relevant tags.
"""
if is_sci:
prompt += "\n3. Extract the scientific references mentioned in this specific chunk. If there is a DOI reference, include that in the reference. Sometimes the reference is only a number in brackets, like [1], so make sure to include that as well (in brackets)."
prompt += "\nONLY use the information in the chunks to make the summary, and do not add any information that is not in the chunks."
try:
response = llm.generate(prompt, format=ArticleChunk.model_json_schema())
structured_response = ArticleChunk.model_validate_json(response.content)
chunk["summary"] = structured_response.summary
chunk["tags"] = [i.lower() for i in structured_response.tags]
chunk["summary_meta"] = {
"model": llm.model,
"date": datetime.now().strftime("%Y-%m-%d"),
}
except Exception as e:
print(f"Error processing chunk: {e}")
chunks.append(chunk)
document["chunks"] = chunks
self.update_document(document, check_rev=False)
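    # Usage sketch (hypothetical document; chunks that already have a "summary"
    # are skipped, so the call is safe to re-run after a partial failure):
    #
    #     doc = arango.get_article("example_key")
    #     arango.summarise_chunks(doc, is_sci=True)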
if __name__ == "__main__":
    arango = ArangoDB(user="lasse")
    random_doc = arango.db.aql.execute(
        "FOR doc IN other_documents LIMIT 1 RETURN doc"
    )
    print(next(random_doc))

    arango = ArangoDB(db_name="base")
    articles = arango.db.collection("sci_articles").all()
    for article in articles:
        if "metadata" in article and article["metadata"]:
            if "abstract" in article["metadata"]:
                abstract = article["metadata"]["abstract"]
                if isinstance(abstract, str):
                    # Remove text within <> brackets and the brackets themselves
                    article["metadata"]["abstract"] = re.sub(r"<[^>]*>", "", abstract)
                    arango.db.collection("sci_articles").update_match(
                        filters={"_key": article["_key"]},
                        body={"metadata": article["metadata"]},
                        merge=True,
                    )
                    print(f"Updated abstract for {article['_key']}")

_base_class.py
@@ -5,17 +5,17 @@ import streamlit as st
from _arango import ArangoDB
from _chromadb import ChromaDB
class BaseClass:
def __init__(self, username: str, **kwargs) -> None:
self.username: str = username
        self.project_name: str = kwargs.get("project_name", None)
        self.collection: str = kwargs.get("collection_name", None)
self.user_arango: ArangoDB = self.get_arango()
self.base_arango: ArangoDB = self.get_arango(admin=True)
for key, value in kwargs.items():
setattr(self, key, value)
def get_arango(self, admin: bool = False, db_name: str = None) -> ArangoDB:
if db_name:
return ArangoDB(db_name=db_name)
@@ -25,29 +25,41 @@ class BaseClass:
return ArangoDB(user=self.username, db_name=self.username)
    def get_article_collections(self) -> list:
        """
        Gets the names of all article collections for the current user.

        Returns:
            list: A list of article collection names.
        """
        article_collections = self.user_arango.execute_aql(
            'FOR doc IN article_collections RETURN doc["name"]'
        )
        return list(article_collections)
    def get_projects(self) -> list:
        """
        Gets the names of all projects for the current user.

        Returns:
            list: A list of project names.
        """
        projects = self.user_arango.get_projects(username=self.username)
        return [project["name"] for project in projects]
def get_chromadb(self):
return ChromaDB()
    def get_project(self, project_name: str):
        """
        Get a project by name for the current user.

        Args:
            project_name (str): The name of the project.

        Returns:
            dict: The project document if found, None otherwise.
        """
        return self.user_arango.get_project(project_name, username=self.username)
def set_filename(self, filename=None, folder="other_documents"):
"""
@@ -77,6 +89,12 @@ class BaseClass:
self.file_path = file_path + ".pdf"
return file_path
def remove_thinking(self, response):
"""Remove the thinking section from the response"""
response_text = response.content if hasattr(response, "content") else str(response)
if "</think>" in response_text:
return response_text.split("</think>")[1].strip()
return response_text
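    # Usage sketch: strips a <think>...</think> preamble from a reasoning-model
    # reply (callable on any BaseClass subclass):
    #
    #     raw = "<think>internal reasoning</think>The final answer."
    #     clean = self.remove_thinking(raw)  # -> "The final answer."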
class StreamlitBaseClass(BaseClass):
"""
@@ -98,10 +116,11 @@ class StreamlitBaseClass(BaseClass):
Displays a select box for choosing a collection of favorite articles. Updates the current collection in the session state and the database.
choose_project(text="Select a project") -> str:
Displays a select box for choosing a project. Updates the current project in the session state and the database.
"""
"""
def __init__(self, username: str, **kwargs) -> None:
super().__init__(username, **kwargs)
def get_settings(self, field: str = None):
"""
Retrieve or initialize user settings from the database.
@@ -112,24 +131,31 @@ class StreamlitBaseClass(BaseClass):
are then stored in the Streamlit session state.
Args:
            field (str, optional): The specific field to retrieve from the settings.
                If not provided, the entire settings document is returned.

        Returns:
            dict or any: The entire settings document if no field is specified,
                otherwise the value of the specified field.
"""
        settings = self.user_arango.get_settings()
if not settings:
self.user_arango.db.collection("settings").insert(
{"_key": "settings", "current_collection": None, "current_page": None}
)
default_settings = {
"_key": "settings",
"current_collection": None,
"current_page": None,
}
self.user_arango.initialize_settings(default_settings)
settings = default_settings
# Ensure required fields exist
for i in ["current_collection", "current_page"]:
if i not in settings:
settings[i] = None
st.session_state["settings"] = settings
if field:
            return settings.get(field)
return settings
def update_settings(self, key, value) -> None:
@@ -189,7 +215,6 @@ class StreamlitBaseClass(BaseClass):
st.session_state["current_page"] = page_name
self.update_settings("current_page", page_name)
def choose_collection(self, text="Select a collection of favorite articles") -> str:
"""
Prompts the user to select a collection of favorite articles from a list.
@@ -214,7 +239,7 @@ class StreamlitBaseClass(BaseClass):
self.update_settings("current_collection", collection)
self.update_session_state()
return collection
def choose_project(self, text="Select a project") -> str:
"""
Prompts the user to select a project from a list of available projects.
@@ -231,16 +256,188 @@ class StreamlitBaseClass(BaseClass):
- Prints the chosen project name to the console.
"""
projects = self.get_projects()
        print("projects", projects)
print(self.project_name)
project = st.selectbox(
text,
projects,
index=(
projects.index(self.project_name)
if self.project_name in projects
else None
),
)
print("Choosing project...")
if project:
from projects_page import Project
self.project = Project(self.username, project, self.user_arango)
self.collection = None
self.update_settings("current_project", self.project.name)
self.update_session_state()
            print("CHOSEN PROJECT:", self.project.name)
return self.project
def add_article_to_collection(self, article_id: str, collection_name: str = None):
"""
Add an article to a user's collection.
Args:
article_id (str): The ID of the article.
collection_name (str, optional): The name of the collection. Defaults to current collection.
Returns:
bool: True if the article was added successfully.
"""
if collection_name is None:
collection_name = self.collection
return self.user_arango.add_article_to_collection(article_id, collection_name)
def remove_article_from_collection(
self, article_id: str, collection_name: str = None
):
"""
Remove an article from a user's collection.
Args:
article_id (str): The ID of the article.
collection_name (str, optional): The name of the collection. Defaults to current collection.
Returns:
bool: True if the article was removed successfully.
"""
if collection_name is None:
collection_name = self.collection
return self.user_arango.remove_article_from_collection(
article_id, collection_name
)
def get_project_notes(self, project_name: str = None):
"""
Get notes for a project.
Args:
project_name (str, optional): The name of the project. Defaults to current project.
Returns:
list: A list of note documents.
"""
if project_name is None:
project_name = self.project_name
return self.user_arango.get_project_notes(project_name, username=self.username)
def add_note_to_project(self, note_data: dict):
"""
Add a note to a project.
Args:
note_data (dict): The note data. Should contain project, username, and timestamp.
Returns:
dict: The created note document.
"""
if "project" not in note_data:
note_data["project"] = self.project_name
if "username" not in note_data:
note_data["username"] = self.username
return self.user_arango.add_note_to_project(note_data)
def create_project(self, project_data: dict):
"""
Create a new project for the current user.
Args:
project_data (dict): The project data. Should include a name field.
Returns:
dict: The created project document.
"""
if "username" not in project_data:
project_data["username"] = self.username
return self.user_arango.create_project(project_data)
def update_project(self, project_data: dict):
"""
Update an existing project.
Args:
project_data (dict): The project data. Must include _key.
Returns:
dict: The updated project document.
"""
return self.user_arango.update_project(project_data)
def delete_project(self, project_name: str):
"""
Delete a project for the current user.
Args:
project_name (str): The name of the project.
Returns:
bool: True if the project was deleted successfully.
"""
return self.user_arango.delete_project(project_name, username=self.username)
def get_chat(self, chat_key: str):
"""
Get a chat by key.
Args:
chat_key (str): The key of the chat.
Returns:
dict: The chat document if found, None otherwise.
"""
return self.user_arango.get_chat(chat_key)
def create_or_update_chat(self, chat_data: dict):
"""
Create or update a chat.
Args:
chat_data (dict): The chat data.
Returns:
dict: The created or updated chat document.
"""
if "username" not in chat_data:
chat_data["username"] = self.username
return self.user_arango.create_or_update_chat(chat_data)
def get_chats_for_project(self, project_name: str = None):
"""
Get all chats for a project.
Args:
project_name (str, optional): The name of the project. Defaults to current project.
Returns:
list: A list of chat documents.
"""
if project_name is None:
project_name = self.project_name
return self.user_arango.get_chats_for_project(
project_name, username=self.username
)
def delete_chat(self, chat_key: str):
"""
Delete a chat.
Args:
chat_key (str): The key of the chat.
Returns:
dict: The deletion result.
"""
return self.user_arango.delete_chat(chat_key)

_bots.py
@@ -1,800 +0,0 @@
from datetime import datetime
import streamlit as st
from _base_class import StreamlitBaseClass, BaseClass
from _llm import LLM
from prompts import *
from colorprinter.print_color import *
from llm_tools import ToolRegistry
class Chat(StreamlitBaseClass):
def __init__(self, username=None, **kwargs):
super().__init__(username=username, **kwargs)
self.name = kwargs.get("name", None)
self.chat_history = kwargs.get("chat_history", [])
def add_message(self, role, content):
self.chat_history.append(
{
"role": role,
"content": content.strip().strip('"'),
"role_type": self.role,
}
)
def to_dict(self):
return {
"_key": self._key,
"name": self.name,
"chat_history": self.chat_history,
"role": self.role,
"username": self.username,
}
def update_in_arango(self):
self.last_updated = datetime.now().isoformat()
self.user_arango.db.collection("chats").insert(
self.to_dict(), overwrite=True, overwrite_mode="update"
)
def set_name(self, user_input):
llm = LLM(
model="small",
max_length_answer=50,
temperature=0.4,
system_message="You are a chatbot who will be chatting with a user",
)
prompt = (
f'Give a short name to the chat based on this user input: "{user_input}" '
"No more than 30 characters. Answer ONLY with the name of the chat."
)
name = llm.generate(prompt).content.strip('"')
name = f'{name} - {datetime.now().strftime("%B %d")}'
existing_chat = self.user_arango.db.aql.execute(
f'FOR doc IN chats FILTER doc.name == "{name}" RETURN doc', count=True
)
if existing_chat.count() > 0:
name = f'{name} ({datetime.now().strftime("%H:%M")})'
name += f" - [{self.role}]"
self.name = name
return name
@classmethod
def from_dict(cls, data):
return cls(
username=data.get("username"),
name=data.get("name"),
chat_history=data.get("chat_history", []),
role=data.get("role", "Research Assistant"),
_key=data.get("_key"),
)
def chat_history2bot(self, n_messages: int = None, remove_system: bool = False):
history = [
{"role": m["role"], "content": m["content"]} for m in self.chat_history
]
if n_messages and len(history) > n_messages:
history = history[-n_messages:]
if (
all([history[0]["role"] == "system", remove_system])
or history[0]["role"] == "assistant"
):
history = history[1:]
return history
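    # Usage sketch (hypothetical `chat` instance): the last four messages in
    # plain {role, content} form, with a leading system message dropped, e.g.
    # to seed a helper bot's context window.
    #
    #     history = chat.chat_history2bot(n_messages=4, remove_system=True)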
class Bot(BaseClass):
def __init__(self, username: str, chat: Chat = None, tools: list = None, **kwargs):
super().__init__(username=username, **kwargs)
# Use the passed in chat or create a new Chat
self.chat = chat if chat else Chat(username=username, role="Research Assistant")
print_yellow(f"Chat:", chat, type(chat))
# Store or set up project/collection if available
self.project = kwargs.get("project", None)
self.collection = kwargs.get("collection", None)
if self.collection and not isinstance(self.collection, list):
self.collection = [self.collection]
# Load articles in the collections
self.arango_ids = []
if self.collection:
for c in self.collection:
for _id in self.user_arango.db.aql.execute(
"""
FOR doc IN article_collections
FILTER doc.name == @collection
FOR article IN doc.articles
RETURN article._id
""",
bind_vars={"collection": c},
):
self.arango_ids.append(_id)
# A standard LLM for normal chat
self.chatbot = LLM(messages=self.chat.chat_history2bot())
# A helper bot for generating queries or short prompts
self.helperbot = LLM(
temperature=0,
model="small",
max_length_answer=500,
system_message=get_query_builder_system_message(),
messages=self.chat.chat_history2bot(n_messages=4, remove_system=True),
)
# A specialized LLM picking which tool to use
self.toolbot = LLM(
temperature=0,
system_message="""
You are an assistant bot helping an answering bot to answer a user's messages.
Your task is to choose one or multiple tools that will help the answering bot to provide the user with the best possible answer.
You should NEVER directly answer the user. You MUST choose a tool.
""",
chat=False,
model="small",
)
# Load or register the passed-in tools
if tools:
self.tools = ToolRegistry.get_tools(tools=tools)
else:
self.tools = ToolRegistry.get_tools()
# Store other kwargs
for arg in kwargs:
setattr(self, arg, kwargs[arg])
def get_chunks(
self,
user_input,
collections=["sci_articles", "other_documents"],
n_results=7,
n_sources=4,
filter=True,
):
# Basic version without Streamlit calls
query = self.helperbot.generate(
get_generate_vector_query_prompt(user_input, self.chat.role)
).content.strip('"')
combined_chunks = []
if collections:
for collection in collections:
where_filter = {"_id": {"$in": self.arango_ids}} if filter else {}
chunks = self.get_chromadb().query(
query=query,
collection=collection,
n_results=n_results,
n_sources=n_sources,
where=where_filter,
max_retries=3,
)
for doc, meta, dist in zip(
chunks["documents"][0],
chunks["metadatas"][0],
chunks["distances"][0],
):
combined_chunks.append(
{"document": doc, "metadata": meta, "distance": dist}
)
combined_chunks.sort(key=lambda x: x["distance"])
# Keep the best chunks according to n_sources
sources = set()
closest_chunks = []
for chunk in combined_chunks:
source_id = chunk["metadata"].get("_id", "no_id")
if source_id not in sources:
sources.add(source_id)
closest_chunks.append(chunk)
if len(sources) >= n_sources:
break
if len(closest_chunks) < n_results:
remaining_chunks = [
c for c in combined_chunks if c not in closest_chunks
]
closest_chunks.extend(remaining_chunks[: n_results - len(closest_chunks)])
# Now fetch real metadata from Arango
for chunk in closest_chunks:
_id = chunk["metadata"].get("_id")
if not _id:
continue
if _id.startswith("sci_articles"):
arango_doc = self.base_arango.db.document(_id)
else:
arango_doc = self.user_arango.db.document(_id)
if arango_doc:
arango_metadata = arango_doc.get("metadata", {})
# Possibly merge notes
if "user_notes" in arango_doc:
arango_metadata["user_notes"] = arango_doc["user_notes"]
chunk["metadata"] = arango_metadata
# Group by article title
grouped_chunks = {}
article_number = 1
for chunk in closest_chunks:
title = chunk["metadata"].get("title", "No title")
chunk["article_number"] = article_number
if title not in grouped_chunks:
grouped_chunks[title] = {
"article_number": article_number,
"chunks": [],
}
article_number += 1
grouped_chunks[title]["chunks"].append(chunk)
return grouped_chunks
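    # Shape sketch of the return value (hypothetical data), grouped by article title:
    #
    #     {
    #         "Some Article Title": {
    #             "article_number": 1,
    #             "chunks": [{"document": "...", "metadata": {...}, "distance": 0.12}],
    #         },
    #     }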
def answer_tool_call(self, response, user_input):
bot_responses = []
# This method returns / stores responses (no Streamlit calls)
if not response.get("tool_calls"):
return ""
for tool in response.get("tool_calls"):
function_name = tool.function.get('name')
arguments = tool.function.arguments
arguments["query"] = user_input
if hasattr(self, function_name):
if function_name in [
"fetch_other_documents_tool",
"fetch_science_articles_tool",
"fetch_science_articles_and_other_documents_tool",
]:
chunks = getattr(self, function_name)(**arguments)
bot_responses.append(
self.generate_from_chunks(user_input, chunks).strip('"')
)
elif function_name == "fetch_notes_tool":
notes = getattr(self, function_name)()
bot_responses.append(
self.generate_from_notes(user_input, notes).strip('"')
)
elif function_name == "conversational_response_tool":
bot_responses.append(
getattr(self, function_name)(user_input).strip('"')
)
return "\n\n".join(bot_responses)
def process_user_input(self, user_input, content_attachment=None):
# Add user message
self.chat.add_message("user", user_input)
if not content_attachment:
prompt = get_tools_prompt(user_input)
response = self.toolbot.generate(prompt, tools=self.tools, stream=False)
if response.get("tool_calls"):
bot_response = self.answer_tool_call(response, user_input)
else:
# Just respond directly
bot_response = response.content.strip('"')
else:
# If there's an attachment, do something minimal
bot_response = "Content attachment received (Base Bot)."
# Add assistant message
if self.chat.chat_history[-1]["role"] != "assistant":
self.chat.add_message("assistant", bot_response)
# Update in Arango
self.chat.update_in_arango()
return bot_response
def generate_from_notes(self, user_input, notes):
# No Streamlit calls
notes_string = ""
for note in notes:
notes_string += f"\n# {note.get('title','No title')}\n{note.get('content','')}\n---\n"
prompt = get_chat_prompt(user_input, content_string=notes_string, role=self.chat.role)
return self.chatbot.generate(prompt, stream=True)
def generate_from_chunks(self, user_input, chunks):
# No Streamlit calls
chunks_string = ""
for title, group in chunks.items():
user_notes_string = ""
if "user_notes" in group["chunks"][0]["metadata"]:
notes = group["chunks"][0]["metadata"]["user_notes"]
user_notes_string = f'\n\nUser notes:\n"""\n{notes}\n"""\n\n'
docs = "\n(...)\n".join([c["document"] for c in group["chunks"]])
chunks_string += (
f"\n# {title}\n## Article #{group['article_number']}\n{user_notes_string}{docs}\n---\n"
)
prompt = get_chat_prompt(user_input, content_string=chunks_string, role=self.chat.role)
return self.chatbot.generate(prompt, stream=True)
def run(self):
# Base Bot has no Streamlit run loop
pass
def get_notes(self):
# Minimal note retrieval
notes = self.user_arango.db.aql.execute(
f'FOR doc IN notes FILTER doc.project == "{self.project.name if self.project else ""}" RETURN doc'
)
return list(notes)
@ToolRegistry.register
def fetch_science_articles_tool(self, query: str, n_documents: int):
"""
"Fetches information from scientific articles. Use this tool when the user is looking for information from scientific articles."
Parameters:
query (str): The search query to find relevant scientific articles.
n_documents (int): How many documents to fetch. A complex query may require more documents. Min: 3, Max: 10.
Returns:
list: A list of chunks containing information from the fetched scientific articles.
"""
print_purple('Query:', query)
n_documents = int(n_documents)
if n_documents < 3:
n_documents = 3
elif n_documents > 10:
n_documents = 10
return self.get_chunks(
query, collections=["sci_articles"], n_results=n_documents
)
@ToolRegistry.register
def fetch_other_documents_tool(self, query: str, n_documents: int):
"""
Fetches information from other documents based on the user's query.
This method retrieves information from various types of documents such as reports, news articles, and other texts. It should be used only when it is clear that the user is not seeking scientific articles.
Args:
query (str): The search query provided by the user.
n_documents (int): How many documents to fetch. A complex query may require more documents. Min: 2, Max: 10.
Returns:
list: A list of document chunks that match the query.
"""
assert isinstance(self, Bot), "The first argument must be a Bot object."
n_documents = int(n_documents)
if n_documents < 2:
n_documents = 2
elif n_documents > 10:
n_documents = 10
return self.get_chunks(
query,
collections=[f"{self.username}__other_documents"],
n_results=n_documents,
)
@ToolRegistry.register
def fetch_science_articles_and_other_documents_tool(
self, query: str, n_documents: int
):
"""
Fetches information from both scientific articles and other documents.
This method is often used when the user hasn't specified what kind of sources they are interested in.
Args:
query (str): The search query to fetch information for.
n_documents (int): How many documents to fetch. A complex query may require more documents. Min: 3, Max: 10.
Returns:
list: A list of document chunks that match the search query.
"""
assert isinstance(self, Bot), "The first argument must be a Bot object."
n_documents = int(n_documents)
if n_documents < 3:
n_documents = 3
elif n_documents > 10:
n_documents = 10
return self.get_chunks(
query,
collections=["sci_articles", f"{self.username}__other_documents"],
n_results=n_documents,
)
@ToolRegistry.register
def fetch_notes_tool(bot):
"""
Fetches information from the project notes when you as an editor need context from the project notes to understand other information. ONLY use this together with other tools! No arguments needed.
Returns:
list: A list of notes.
"""
assert isinstance(bot, Bot), "The first argument must be a Bot object."
return bot.get_notes()
@ToolRegistry.register
def conversational_response_tool(self, query: str):
"""
Generate a conversational response to a user's query.
This method is designed to provide a short and conversational response
without fetching additional data. It should be used only when it is clear
that the user is engaging in small talk (like saying 'hi') and not seeking detailed information.
Args:
query (str): The user's message to which the bot should respond.
Returns:
str: The generated conversational response.
"""
query = f"""
User message: "{query}".
Make your answer short and conversational.
This is perhaps not a conversation about a journalistic project, so try not to be too informative.
Don't answer with anything you're not sure of!
"""
result = (
self.chatbot.generate(query, stream=True)
if self.chatbot
else self.llm.generate(query, stream=True)
)
return result
class StreamlitBot(Bot):
    def __init__(self, username: str, chat: Chat = None, tools: list = None, **kwargs):
print_purple("StreamlitBot init chat:", chat)
super().__init__(username=username, chat=chat, tools=tools, **kwargs)
# For Streamlit, we can override or add attributes
if 'llm_chosen_backend' not in st.session_state:
st.session_state['llm_chosen_backend'] = None
self.chatbot.chosen_backend = st.session_state['llm_chosen_backend']
if not st.session_state['llm_chosen_backend']:
st.session_state['llm_chosen_backend'] = self.chatbot.chosen_backend
def run(self):
# Example Streamlit run loop
self.chat.show_chat_history()
if user_input := st.chat_input("Write your message here...", accept_file=True):
text_input = user_input.text.replace('"""', "---")
if len(user_input.files) > 1:
st.error("Please upload only one file at a time.")
return
attached_file = user_input.files[0] if user_input.files else None
content_attachment = None
if attached_file:
if attached_file.type == "application/pdf":
import fitz
pdf_document = fitz.open(stream=attached_file.read(), filetype="pdf")
pdf_text = ""
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
pdf_text += page.get_text()
content_attachment = pdf_text
elif attached_file.type in ["image/png", "image/jpeg"]:
self.chat.message_attachments = "image"
content_attachment = attached_file.read()
with st.chat_message("user", avatar=self.chat.get_avatar(role="user")):
st.image(content_attachment)
with st.chat_message("user", avatar=self.chat.get_avatar(role="user")):
st.write(text_input)
if not self.chat.name:
self.chat.set_name(text_input)
self.chat.last_updated = datetime.now().isoformat()
self.chat.saved = False
self.user_arango.db.collection("chats").insert(
self.chat.to_dict(), overwrite=True, overwrite_mode="update"
)
self.process_user_input(text_input, content_attachment)
def process_user_input(self, user_input, content_attachment=None):
# We override to show messages in Streamlit instead of just storing
self.chat.add_message("user", user_input)
if not content_attachment:
prompt = get_tools_prompt(user_input)
response = self.toolbot.generate(prompt, tools=self.tools, stream=False)
if response.get("tool_calls"):
bot_response = self.answer_tool_call(response, user_input)
else:
bot_response = response.content.strip('"')
with st.chat_message("assistant", avatar=self.chat.get_avatar(role="assistant")):
st.write(bot_response)
else:
with st.chat_message("assistant", avatar=self.chat.get_avatar(role="assistant")):
with st.spinner("Reading the content..."):
if self.chat.message_attachments == "image":
prompt = get_chat_prompt(user_input, role=self.chat.role, image_attachment=True)
bot_resp = self.chatbot.generate(prompt, stream=False, images=[content_attachment], model="vision")
st.write(bot_resp)
bot_response = bot_resp
else:
prompt = get_chat_prompt(user_input, content_attachment=content_attachment, role=self.chat.role)
response = self.chatbot.generate(prompt, stream=True)
bot_response = st.write_stream(response)
if self.chat.chat_history[-1]["role"] != "assistant":
self.chat.add_message("assistant", bot_response)
self.chat.update_in_arango()
def answer_tool_call(self, response, user_input):
bot_responses = []
for tool in response.get("tool_calls", []):
function_name = tool.function.get('name')
arguments = tool.function.arguments
arguments["query"] = user_input
with st.chat_message("assistant", avatar=self.chat.get_avatar(role="assistant")):
if function_name in [
"fetch_other_documents_tool",
"fetch_science_articles_tool",
"fetch_science_articles_and_other_documents_tool",
]:
chunks = getattr(self, function_name)(**arguments)
response_text = self.generate_from_chunks(user_input, chunks)
bot_response = st.write_stream(response_text).strip('"')
if chunks:
sources = "###### Sources:\n"
for title, group in chunks.items():
j = group["chunks"][0]["metadata"].get("journal", "No Journal")
d = group["chunks"][0]["metadata"].get("published_date", "No Date")
sources += f"[{group['article_number']}] **{title}** :gray[{j} ({d})]\n"
st.markdown(sources)
bot_response += f"\n\n{sources}"
bot_responses.append(bot_response)
elif function_name == "fetch_notes_tool":
notes = getattr(self, function_name)()
response_text = self.generate_from_notes(user_input, notes)
bot_responses.append(st.write_stream(response_text).strip('"'))
elif function_name == "conversational_response_tool":
response_text = getattr(self, function_name)(user_input)
bot_responses.append(st.write_stream(response_text).strip('"'))
return "\n\n".join(bot_responses)
def generate_from_notes(self, user_input, notes):
with st.spinner("Reading project notes..."):
return super().generate_from_notes(user_input, notes)
def generate_from_chunks(self, user_input, chunks):
# For reading articles with a spinner
magazines = set()
for group in chunks.values():
j = group["chunks"][0]["metadata"].get("journal", "No Journal")
magazines.add(f"*{j}*")
s = (
f"Reading articles from {', '.join(list(magazines)[:-1])} and {list(magazines)[-1]}..."
if len(magazines) > 1
else "Reading articles..."
)
with st.spinner(s):
return super().generate_from_chunks(user_input, chunks)
def sidebar_content(self):
with st.sidebar:
st.write("---")
st.markdown(f'#### {self.chat.name if self.chat.name else ""}')
st.button("Delete this chat", on_click=self.delete_chat)
def delete_chat(self):
self.user_arango.db.collection("chats").delete_match(
filters={"name": self.chat.name}
)
self.chat = Chat()
def get_notes(self):
# We can show a spinner or messages too
with st.spinner("Fetching notes..."):
return super().get_notes()
class EditorBot(StreamlitBot):
def __init__(self, chat: Chat, username: str, **kwargs):
print_blue("EditorBot init chat:", chat)
super().__init__(chat=chat, username=username, **kwargs)
self.role = "Editor"
self.tools = ToolRegistry.get_tools()
self.chatbot = LLM(
system_message=get_editor_prompt(kwargs.get("project")),
messages=self.chat.chat_history2bot(),
chosen_backend=kwargs.get("chosen_backend"),
)
class ResearchAssistantBot(StreamlitBot):
def __init__(self, chat: Chat, username: str, **kwargs):
super().__init__(chat=chat, username=username, **kwargs)
self.role = "Research Assistant"
self.chatbot = LLM(
system_message=get_assistant_prompt(),
temperature=0.1,
messages=self.chat.chat_history2bot(),
)
self.tools = [
self.fetch_science_articles_tool,
self.fetch_science_articles_and_other_documents_tool,
]
class PodBot(StreamlitBot):
"""Two LLM agents construct a conversation using material from science articles."""
def __init__(
self,
chat: Chat,
subject: str,
username: str,
instructions: str = None,
**kwargs,
):
super().__init__(chat=chat, username=username, **kwargs)
self.subject = subject
self.instructions = instructions
self.guest_name = kwargs.get("name_guest", "Merit")
self.hostbot = HostBot(
Chat(username=self.username, role="Host"),
subject,
username,
instructions=instructions,
**kwargs,
)
self.guestbot = GuestBot(
Chat(username=self.username, role="Guest"),
subject,
username,
name_guest=self.guest_name,
**kwargs,
)
def run(self):
notes = self.get_notes()
notes_string = ""
if self.instructions:
instructions_string = f'''
These are the instructions for the podcast from the producer:
"""
{self.instructions}
"""
'''
else:
instructions_string = ""
for note in notes:
notes_string += f"\n# {note['title']}\n{note['content']}\n---\n"
a = f'''You will make a podcast interview with {self.guest_name}, an expert on "{self.subject}".
{instructions_string}
Below are notes on the subject that you can use to ask relevant questions:
"""
{notes_string}
"""
Say hello to the expert and start the interview. Remember to keep the interview to the subject of {self.subject} throughout the conversation.
'''
# Stop button for the podcast
with st.sidebar:
stop = st.button("Stop podcast", on_click=self.stop_podcast)
while st.session_state["make_podcast"]:
            self.chat.show_chat_history()
            # Wrap up the podcast once the chat history reaches 14 messages
            if len(self.chat.chat_history) == 14:
result = self.hostbot.generate(
"The interview has ended. Say thank you to the expert and end the conversation."
)
self.chat.add_message("Host", result)
with st.chat_message(
"assistant", avatar=self.chat.get_avatar(role="assistant")
):
st.write(result.strip('"'))
st.stop()
_q = self.hostbot.toolbot.generate(
query=f"{self.guest_name} has answered: {a}. You have to choose a tool to help the host continue the interview.",
tools=self.hostbot.tools,
temperature=0.6,
stream=False,
)
if "tool_calls" in _q:
q = self.hostbot.answer_tool_call(_q, a)
else:
q = _q
self.chat.add_message("Host", q)
_a = self.guestbot.toolbot.generate(
f'The podcast host has asked: "{q}" Choose a tool to help the expert answer with relevant facts and information.',
tools=self.guestbot.tools,
)
if "tool_calls" in _a:
print_yellow("Tool call response (guest)", _a)
print_yellow(self.guestbot.chat.role)
a = self.guestbot.answer_tool_call(_a, q)
else:
a = _a
self.chat.add_message("Guest", a)
self.update_session_state()
def stop_podcast(self):
st.session_state["make_podcast"] = False
self.update_session_state()
self.chat.show_chat_history()
class HostBot(StreamlitBot(Bot)):
def __init__(
self, chat: Chat, subject: str, username: str, instructions: str, **kwargs
):
super().__init__(chat=chat, username=username, **kwargs)
self.chat.role = kwargs.get("role", "Host")
self.tools = ToolRegistry.get_tools(
tools=[
self.fetch_notes_tool,
self.conversational_response_tool,
# "fetch_other_documents", #TODO Should this be included?
]
)
self.instructions = instructions
self.llm = LLM(
system_message=f'''
You are the host of a podcast and an expert on {subject}. You will ask one question at a time about the subject, and then wait for the guest to answer.
Don't ask the guest to talk about herself/himself, only about the subject.
Make your questions short and clear, only if necessary add a brief context to the question.
These are the instructions for the podcast from the producer:
"""
{self.instructions}
"""
        If the expert's answer is complicated, try to make a very brief summary of it for the audience to understand. You can also ask follow-up questions to clarify the answer, or ask for examples.
''',
messages=self.chat.chat_history2bot()
)
self.toolbot = LLM(
temperature=0,
system_message="""
You are assisting a podcast host in asking questions to an expert.
Choose one or many tools to use in order to assist the host in asking relevant questions.
Often "conversational_response_tool" is enough, but sometimes project notes are needed.
Make sure to read the description of the tools carefully!""",
chat=False,
model="small",
)
def generate(self, query):
return self.llm.generate(query)
class GuestBot(StreamlitBot(Bot)):
def __init__(self, chat: Chat, subject: str, username: str, **kwargs):
super().__init__(chat=chat, username=username, **kwargs)
self.chat.role = kwargs.get("role", "Guest")
self.tools = ToolRegistry.get_tools(
tools=[
self.fetch_notes_tool,
self.fetch_science_articles_tool,
]
)
self.llm = LLM(
system_message=f"""
            You are {kwargs.get('name_guest', 'Merit')}, an expert on {subject}.
Today you are a guest in a podcast about {subject}. A host will ask you questions about the subject and you will answer by using scientific facts and information.
When answering, don't say things like "based on the documents" or alike, as neither the host nor the audience can see the documents. Act just as if you were talking to someone in a conversation.
            Try to be concise when answering, and remember that the audience of the podcast are not experts on the subject, so don't complicate things too much.
            It's very important that you answer in a "spoken" way, as if you were talking to someone in a conversation. That means you should avoid scientific jargon and complex terms, too many figures, and abstract concepts.
            Lists are also not recommended; instead use "for the first reason", "secondly", and so on.
            Use "..." to indicate a pause and "-" to indicate a break in the sentence, as if you were speaking.
""",
messages=self.chat.chat_history2bot()
)
self.toolbot = LLM(
temperature=0,
system_message=f"You are an assistant to an expert on {subject}. Choose one or many tools to use in order to assist the expert in answering questions. Make sure to read the description of the tools carefully.",
chat=False,
model="small",
)
def generate(self, query):
return self.llm.generate(query)
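
# --- Usage sketch (not part of the module API): wiring up a PodBot. ---
# Assumptions: Chat and PodBot as defined above, a running Streamlit session,
# and reachable LLM/Arango backends. The username, subject, and instructions
# below are placeholders, not values from this repo.
if __name__ == "__main__":
    st.session_state.setdefault("make_podcast", True)
    demo_chat = Chat(username="demo_user", role="Host")
    demo_bot = PodBot(
        chat=demo_chat,
        subject="open science",                # hypothetical subject
        username="demo_user",                  # hypothetical user
        instructions="Keep questions short.",  # hypothetical producer notes
    )
    demo_bot.run()  # alternates HostBot questions and GuestBot answers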

@ -0,0 +1,497 @@
from datetime import datetime
import streamlit as st
import uuid
from _base_class import StreamlitBaseClass, BaseClass
from _llm import LLM
from _arango import ArangoDB
from prompts import *
from colorprinter.print_color import *
from llm_tools import ToolRegistry
from streamlit_chatbot import StreamlitBot, PodBot, EditorBot, ResearchAssistantBot
class Chat(StreamlitBaseClass):
def __init__(self, username=None, **kwargs):
super().__init__(username=username, **kwargs)
self.name = kwargs.get("name", None)
self.chat_history = kwargs.get("chat_history", [])
self.role = kwargs.get("role", "Research Assistant")
self._key = kwargs.get("_key", str(uuid.uuid4()))
self.saved = kwargs.get("saved", False)
self.last_updated = kwargs.get("last_updated", datetime.now().isoformat())
self.message_attachments = None
self.project = kwargs.get("project", None)
def add_message(self, role, content):
self.chat_history.append(
{
"role": role,
"content": content.strip().strip('"'),
"role_type": self.role,
}
)
def to_dict(self):
return {
"_key": self._key,
"name": self.name,
"chat_history": self.chat_history,
"role": self.role,
"username": self.username,
"project": self.project,
"last_updated": self.last_updated,
"saved": self.saved,
}
def update_in_arango(self):
"""Update chat in ArangoDB using the new API"""
self.last_updated = datetime.now().isoformat()
# Use the create_or_update_chat method from the new API
self.user_arango.create_or_update_chat(self.to_dict())
def set_name(self, user_input):
llm = LLM(
model="small",
max_length_answer=50,
temperature=0.4,
system_message="You are a chatbot who will be chatting with a user",
)
prompt = (
f'Give a short name to the chat based on this user input: "{user_input}" '
"No more than 30 characters. Answer ONLY with the name of the chat."
)
name = llm.generate(prompt).content.strip('"')
name = f'{name} - {datetime.now().strftime("%B %d")}'
# Check for existing chat with the same name
existing_chat = self.user_arango.execute_aql(
"""
FOR chat IN chats
FILTER chat.name == @name AND chat.username == @username
RETURN chat
""",
bind_vars={"name": name, "username": self.username}
)
if list(existing_chat):
name = f'{name} ({datetime.now().strftime("%H:%M")})'
name += f" - [{self.role}]"
self.name = name
return name
def show_chat_history(self):
"""Display chat history in the Streamlit UI"""
for message in self.chat_history:
with st.chat_message(
name="assistant" if message["role"] == "assistant" else "user",
avatar=self.get_avatar(role=message["role"])
):
st.write(message["content"])
def get_avatar(self, role):
"""Get avatar for a role"""
if role == "user":
return None
elif role == "Host":
return "🎙"
elif role == "Guest":
return "🎤"
elif role == "assistant":
if self.role == "Research Assistant":
return "🔬"
elif self.role == "Editor":
return "📝"
else:
return "🤖"
return None
@classmethod
def from_dict(cls, data):
return cls(
username=data.get("username"),
name=data.get("name"),
chat_history=data.get("chat_history", []),
role=data.get("role", "Research Assistant"),
_key=data.get("_key"),
project=data.get("project"),
last_updated=data.get("last_updated"),
saved=data.get("saved", False),
)
    def chat_history2bot(self, n_messages: int = None, remove_system: bool = False):
        history = [
            {"role": m["role"], "content": m["content"]} for m in self.chat_history
        ]
        if n_messages and len(history) > n_messages:
            history = history[-n_messages:]
        # Drop a leading system message (when requested) or a stray leading assistant message
        if history and (
            (history[0]["role"] == "system" and remove_system)
            or history[0]["role"] == "assistant"
        ):
            history = history[1:]
        return history
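    # A minimal sketch of what chat_history2bot() returns (assumes Chat can be
    # constructed in this environment; message texts are placeholders):
    #
    #   chat = Chat(username="demo_user")
    #   chat.add_message("system", "You are a research assistant.")
    #   chat.add_message("user", "Hello")
    #   chat.add_message("assistant", "Hi! How can I help?")
    #   chat.add_message("user", "Summarize my notes")
    #   chat.chat_history2bot(n_messages=2, remove_system=True)
    #   # -> [{"role": "user", "content": "Summarize my notes"}]
    #   # (the trailing window was [assistant, user]; the leading assistant
    #   #  message is dropped, so only the final user message remains)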
class Bot(BaseClass):
def __init__(self, username: str, chat: Chat = None, tools: list = None, **kwargs):
super().__init__(username=username, **kwargs)
# Use the passed in chat or create a new Chat
self.chat = chat if chat else Chat(username=username, role="Research Assistant")
print_yellow(f"Chat:", chat, type(chat))
# Store or set up project/collection if available
self.project = kwargs.get("project", None)
self.collection = kwargs.get("collection", None)
if self.collection and not isinstance(self.collection, list):
self.collection = [self.collection]
# Load articles in the collections using the new API
self.arango_ids = []
if self.collection:
for c in self.collection:
# Use execute_aql from the new API
article_ids = self.user_arango.execute_aql(
"""
FOR doc IN article_collections
FILTER doc.name == @collection
FOR article IN doc.articles
RETURN article
""",
bind_vars={"collection": c}
)
for _id in article_ids:
self.arango_ids.append(_id)
# A standard LLM for normal chat
self.chatbot = LLM(messages=self.chat.chat_history2bot())
# A helper bot for generating queries or short prompts
self.helperbot = LLM(
temperature=0,
model="small",
max_length_answer=500,
system_message=get_query_builder_system_message(),
messages=self.chat.chat_history2bot(n_messages=4, remove_system=True),
)
# A specialized LLM picking which tool to use
self.toolbot = LLM(
temperature=0,
system_message="""
You are an assistant bot helping an answering bot to answer a user's messages.
Your task is to choose one or multiple tools that will help the answering bot to provide the user with the best possible answer.
You should NEVER directly answer the user. You MUST choose a tool.
""",
chat=False,
model="small",
)
# Load or register the passed-in tools
if tools:
self.tools = ToolRegistry.get_tools(tools=tools)
else:
self.tools = ToolRegistry.get_tools()
# Store other kwargs
for arg in kwargs:
setattr(self, arg, kwargs[arg])
def get_chunks(
self,
user_input,
collections=["sci_articles", "other_documents"],
n_results=7,
n_sources=4,
filter=True,
):
# Basic version without Streamlit calls
query = self.helperbot.generate(
get_generate_vector_query_prompt(user_input, self.chat.role)
).content.strip('"')
combined_chunks = []
if collections:
for collection in collections:
where_filter = {"_id": {"$in": self.arango_ids}} if filter else {}
chunks = self.get_chromadb().query(
query=query,
collection=collection,
n_results=n_results,
n_sources=n_sources,
where=where_filter,
max_retries=3,
)
for doc, meta, dist in zip(
chunks["documents"][0],
chunks["metadatas"][0],
chunks["distances"][0],
):
combined_chunks.append(
{"document": doc, "metadata": meta, "distance": dist}
)
combined_chunks.sort(key=lambda x: x["distance"])
# Keep the best chunks according to n_sources
sources = set()
closest_chunks = []
for chunk in combined_chunks:
source_id = chunk["metadata"].get("_id", "no_id")
if source_id not in sources:
sources.add(source_id)
closest_chunks.append(chunk)
if len(sources) >= n_sources:
break
if len(closest_chunks) < n_results:
remaining_chunks = [
c for c in combined_chunks if c not in closest_chunks
]
closest_chunks.extend(remaining_chunks[: n_results - len(closest_chunks)])
# Now fetch real metadata from Arango using the new API
for chunk in closest_chunks:
_id = chunk["metadata"].get("_id")
if not _id:
continue
try:
# Determine which database to use based on collection name
if _id.startswith("sci_articles"):
# Use base_arango for common documents
arango_doc = self.base_arango.get_document(_id)
else:
# Use user_arango for user-specific documents
arango_doc = self.user_arango.get_document(_id)
if arango_doc:
arango_metadata = arango_doc.get("metadata", {})
# Possibly merge notes
if "user_notes" in arango_doc:
arango_metadata["user_notes"] = arango_doc["user_notes"]
chunk["metadata"] = arango_metadata
except Exception as e:
print_red(f"Error fetching document {_id}: {e}")
# Group by article title
grouped_chunks = {}
article_number = 1
for chunk in closest_chunks:
title = chunk["metadata"].get("title", "No title")
chunk["article_number"] = article_number
if title not in grouped_chunks:
grouped_chunks[title] = {
"article_number": article_number,
"chunks": [],
}
article_number += 1
grouped_chunks[title]["chunks"].append(chunk)
return grouped_chunks
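    # Sketch of consuming get_chunks() output, which is grouped by article
    # title (assumes a configured Bot instance; the query is illustrative):
    #
    #   grouped = bot.get_chunks("health effects of microplastics", n_results=5)
    #   for title, group in grouped.items():
    #       print(f"[{group['article_number']}] {title}")
    #       for chunk in group["chunks"]:
    #           print("  distance:", chunk["distance"])
    #           print("  excerpt :", chunk["document"][:80])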
def answer_tool_call(self, response, user_input):
bot_responses = []
# This method returns / stores responses (no Streamlit calls)
if not response.get("tool_calls"):
return ""
for tool in response.get("tool_calls"):
function_name = tool.function.get('name')
arguments = tool.function.arguments
arguments["query"] = user_input
if hasattr(self, function_name):
if function_name in [
"fetch_other_documents_tool",
"fetch_science_articles_tool",
"fetch_science_articles_and_other_documents_tool",
]:
chunks = getattr(self, function_name)(**arguments)
bot_responses.append(
self.generate_from_chunks(user_input, chunks).strip('"')
)
elif function_name == "fetch_notes_tool":
notes = getattr(self, function_name)()
bot_responses.append(
self.generate_from_notes(user_input, notes).strip('"')
)
elif function_name == "conversational_response_tool":
bot_responses.append(
getattr(self, function_name)(user_input).strip('"')
)
return "\n\n".join(bot_responses)
def process_user_input(self, user_input, content_attachment=None):
# Add user message
self.chat.add_message("user", user_input)
if not content_attachment:
prompt = get_tools_prompt(user_input)
response = self.toolbot.generate(prompt, tools=self.tools, stream=False)
if response.get("tool_calls"):
bot_response = self.answer_tool_call(response, user_input)
else:
# Just respond directly
bot_response = response.content.strip('"')
else:
# If there's an attachment, do something minimal
bot_response = "Content attachment received (Base Bot)."
# Add assistant message
if self.chat.chat_history[-1]["role"] != "assistant":
self.chat.add_message("assistant", bot_response)
# Update in Arango
self.chat.update_in_arango()
return bot_response
def generate_from_notes(self, user_input, notes):
# No Streamlit calls
notes_string = ""
for note in notes:
notes_string += f"\n# {note.get('title','No title')}\n{note.get('text','')}\n---\n"
prompt = get_chat_prompt(user_input, content_string=notes_string, role=self.chat.role)
return self.chatbot.generate(prompt, stream=True)
def generate_from_chunks(self, user_input, chunks):
# No Streamlit calls
chunks_string = ""
for title, group in chunks.items():
user_notes_string = ""
if "user_notes" in group["chunks"][0]["metadata"]:
notes = group["chunks"][0]["metadata"]["user_notes"]
user_notes_string = f'\n\nUser notes:\n"""\n{notes}\n"""\n\n'
docs = "\n(...)\n".join([c["document"] for c in group["chunks"]])
chunks_string += (
f"\n# {title}\n## Article #{group['article_number']}\n{user_notes_string}{docs}\n---\n"
)
prompt = get_chat_prompt(user_input, content_string=chunks_string, role=self.chat.role)
return self.chatbot.generate(prompt, stream=True)
def run(self):
# Base Bot has no Streamlit run loop
pass
def get_notes(self):
# Get project notes using the new API
if self.project and hasattr(self.project, "name"):
notes = self.user_arango.get_project_notes(
project_name=self.project.name,
username=self.username
)
return list(notes)
return []
@ToolRegistry.register
def fetch_science_articles_tool(self, query: str, n_documents: int):
"""
"Fetches information from scientific articles. Use this tool when the user is looking for information from scientific articles."
Parameters:
query (str): The search query to find relevant scientific articles.
n_documents (int): How many documents to fetch. A complex query may require more documents. Min: 3, Max: 10.
Returns:
list: A list of chunks containing information from the fetched scientific articles.
"""
print_purple('Query:', query)
n_documents = int(n_documents)
if n_documents < 3:
n_documents = 3
elif n_documents > 10:
n_documents = 10
return self.get_chunks(
query, collections=["sci_articles"], n_results=n_documents
)
@ToolRegistry.register
def fetch_other_documents_tool(self, query: str, n_documents: int):
"""
Fetches information from other documents based on the user's query.
This method retrieves information from various types of documents such as reports, news articles, and other texts. It should be used only when it is clear that the user is not seeking scientific articles.
Args:
query (str): The search query provided by the user.
n_documents (int): How many documents to fetch. A complex query may require more documents. Min: 2, Max: 10.
Returns:
list: A list of document chunks that match the query.
"""
assert isinstance(self, Bot), "The first argument must be a Bot object."
n_documents = int(n_documents)
if n_documents < 2:
n_documents = 2
elif n_documents > 10:
n_documents = 10
return self.get_chunks(
query,
collections=[f"{self.username}__other_documents"],
n_results=n_documents,
)
@ToolRegistry.register
def fetch_science_articles_and_other_documents_tool(
self, query: str, n_documents: int
):
"""
Fetches information from both scientific articles and other documents.
This method is often used when the user hasn't specified what kind of sources they are interested in.
Args:
query (str): The search query to fetch information for.
n_documents (int): How many documents to fetch. A complex query may require more documents. Min: 3, Max: 10.
Returns:
list: A list of document chunks that match the search query.
"""
assert isinstance(self, Bot), "The first argument must be a Bot object."
n_documents = int(n_documents)
if n_documents < 3:
n_documents = 3
elif n_documents > 10:
n_documents = 10
return self.get_chunks(
query,
collections=["sci_articles", f"{self.username}__other_documents"],
n_results=n_documents,
)
@ToolRegistry.register
def fetch_notes_tool(bot):
"""
        Fetches the project notes when you, as an editor, need context from them to understand other information. ONLY use this together with other tools! No arguments needed.
Returns:
list: A list of notes.
"""
assert isinstance(bot, Bot), "The first argument must be a Bot object."
return bot.get_notes()
@ToolRegistry.register
def conversational_response_tool(self, query: str):
"""
Generate a conversational response to a user's query.
This method is designed to provide a short and conversational response
without fetching additional data. It should be used only when it is clear
that the user is engaging in small talk (like saying 'hi') and not seeking detailed information.
Args:
query (str): The user's message to which the bot should respond.
Returns:
str: The generated conversational response.
"""
query = f"""
User message: "{query}".
Make your answer short and conversational.
This is perhaps not a conversation about a journalistic project, so try not to be too informative.
Don't answer with anything you're not sure of!
"""
result = (
self.chatbot.generate(query, stream=True)
if self.chatbot
else self.llm.generate(query, stream=True)
)
return result
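
# --- Usage sketch (hedged): driving the base Bot without Streamlit. ---
# Assumptions: reachable LLM, ArangoDB, and ChromaDB backends plus a registered
# user; "demo_user" and the question are placeholders.
if __name__ == "__main__":
    bot = Bot(username="demo_user")
    answer = bot.process_user_input(
        "What does recent research say about open science?"
    )
    print(answer)  # the assistant reply that was also appended to bot.chat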

@ -1,8 +1,13 @@
import chromadb
import os
from typing import Any, Dict, List, Tuple, Union
import re
from chromadb.config import Settings
from dotenv import load_dotenv
from colorprinter.print_color import *
from models import ChunkSearchResults
load_dotenv(".env")
@ -20,6 +25,7 @@ class ChromaDB:
)
self.db = chromadb.HttpClient(
host=host,
#database=db,
settings=Settings(
chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider",
chroma_client_auth_credentials=credentials,
@ -63,14 +69,20 @@ class ChromaDB:
col = self.db.get_collection(collection)
sources = []
n = 0
print('Collection', collection)
result = {"ids": [[]], "metadatas": [[]], "documents": [[]], "distances": [[]]}
while True:
n += 1
if n > max_retries:
break
            where = None
print_rainbow(kwargs)
print('N_results:', n_results)
print('Sources:', sources)
print('Query:', query)
r = col.query(
query_texts=query,
n_results=n_results - len(sources),
@ -79,6 +91,7 @@ class ChromaDB:
)
if r["ids"][0] == []:
if result["ids"][0] == []:
print_rainbow(r)
print_red("No results found in vector database.")
else:
print_red("No more results found in vector database.")
@ -123,6 +136,210 @@ class ChromaDB:
break
return result
def search(
self,
query: str,
collection: str,
n_results: int = 6,
n_sources: int = 3,
where: dict = None,
format_results: bool = False,
**kwargs,
    ) -> Union[dict, List[dict]]:
"""
An enhanced search method that provides a cleaner interface for querying and processing results.
Args:
query (str): The search query
collection (str): Collection name to search in
n_results (int): Maximum number of results to return
n_sources (int): Maximum number of unique sources to include
where (dict, optional): Additional filtering criteria
format_results (bool): Whether to return formatted ChunkSearchResults
**kwargs: Additional arguments to pass to the query
        Returns:
            Union[dict, List[dict]]: Chroma's raw result dict, or a list of chunk dictionaries when format_results is True
"""
# Get raw query results with existing query method
result = self.query(
query=query,
collection=collection,
n_results=n_results,
n_sources=n_sources,
where=where,
**kwargs,
)
# If no formatting requested, return raw results
if not format_results:
return result
# Process results into dictionary format
combined_chunks = []
for doc, meta, dist, _id in zip(
result["documents"][0],
result["metadatas"][0],
result["distances"][0],
result["ids"][0],
):
combined_chunks.append(
{"document": doc, "metadata": meta, "distance": dist, "id": _id}
)
return combined_chunks
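    # Usage sketch for search() (assumes a reachable Chroma server and an
    # existing "sci_articles" collection; the query is illustrative):
    #
    #   db = ChromaDB()
    #   raw = db.search("open science", collection="sci_articles", n_results=4)
    #   # raw keeps Chroma's native shape: raw["documents"][0], raw["metadatas"][0], ...
    #   flat = db.search("open science", collection="sci_articles",
    #                    n_results=4, format_results=True)
    #   for hit in flat:
    #       print(hit["id"], round(hit["distance"], 3), hit["document"][:60])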
def clean_result_text(self, documents: list) -> list:
"""
Clean text in document results by removing footnote references.
Args:
documents (list): List of document dictionaries
Returns:
list: Documents with cleaned text
"""
        for doc in documents:
if "document" in doc:
doc["document"] = re.sub(r"\[\d+\]", "", doc["document"])
return documents
def filter_by_unique_sources(
self, results: list, n_sources: int, source_key: str = "_id"
) -> Tuple[List, List]:
"""
Filters search results to keep only a specified number of unique sources.
Args:
results (list): List of documents from search
n_sources (int): Maximum number of unique sources to include
source_key (str): The key in metadata that identifies the source
Returns:
tuple: (filtered_results, remaining_results)
"""
sources = set()
filtered_results = []
remaining_results = []
for item in results:
source_id = item["metadata"].get(source_key, "no_id")
if source_id not in sources and len(sources) < n_sources:
sources.add(source_id)
filtered_results.append(item)
else:
remaining_results.append(item)
return filtered_results, remaining_results
def backfill_results(
self, filtered_results: list, remaining_results: list, n_results: int
) -> list:
"""
Adds additional results from remaining_results to filtered_results
until n_results is reached.
Args:
filtered_results (list): Initial filtered results
remaining_results (list): Other results that can be added
n_results (int): Target number of total results
Returns:
list: Combined results up to n_results
"""
if len(filtered_results) >= n_results:
return filtered_results[:n_results]
needed = n_results - len(filtered_results)
return filtered_results + remaining_results[:needed]
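    # Tiny worked example of the filter + backfill pair: three chunks from two
    # sources, keeping one unique source but two results overall:
    #
    #   chunks = [
    #       {"metadata": {"_id": "a"}, "distance": 0.1},
    #       {"metadata": {"_id": "b"}, "distance": 0.2},
    #       {"metadata": {"_id": "a"}, "distance": 0.3},
    #   ]
    #   kept, rest = db.filter_by_unique_sources(chunks, n_sources=1)
    #   # kept -> [first "a" chunk]; rest -> ["b" chunk, second "a" chunk]
    #   db.backfill_results(kept, rest, n_results=2)
    #   # -> [first "a" chunk, "b" chunk]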
def search_chunks(
self,
query: str,
collections: List[str],
n_results: int = 7,
n_sources: int = 4,
where: dict = None,
**kwargs,
    ) -> List[dict]:
"""
Complete pipeline for processing chunks: search, filter, clean, and format.
Args:
query (str): The search query
collections (List[str]): List of collection names to search
n_results (int): Maximum number of results to return
n_sources (int): Maximum number of unique sources to include
where (dict, optional): Additional filtering criteria
**kwargs: Additional arguments to pass to search
        Returns:
            List[dict]: Processed chunk dictionaries (document, metadata, distance, and Chroma id)
"""
combined_chunks = []
if isinstance(collections, str):
collections = [collections]
# Search all collections
for collection in collections:
chunks = self.search(
query=query,
collection=collection,
n_results=n_results,
n_sources=n_sources,
where=where,
format_results=True,
**kwargs,
)
for chunk in chunks:
combined_chunks.append({
"document": chunk["document"],
"metadata": chunk["metadata"],
"distance": chunk["distance"],
"id": chunk["id"],
})
# Sort and filter results
combined_chunks.sort(key=lambda x: x["distance"])
# Filter by unique sources and backfill
closest_chunks, remaining_chunks = self.filter_by_unique_sources(
combined_chunks, n_sources
)
closest_chunks = self.backfill_results(
closest_chunks, remaining_chunks, n_results
)
# Clean text
closest_chunks = self.clean_result_text(closest_chunks)
return closest_chunks
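    # Pipeline sketch for search_chunks() (collection names follow this repo's
    # "<user>__other_documents" convention; "demo_user" is a placeholder):
    #
    #   db = ChromaDB()
    #   hits = db.search_chunks(
    #       query="what is open science?",
    #       collections=["sci_articles", "demo_user__other_documents"],
    #       n_results=7,
    #       n_sources=4,
    #   )
    #   for hit in hits:
    #       print(hit["metadata"].get("title", "No title"), "->", hit["document"][:60])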
def add_document(self, _id, collection: str, document: str, metadata: dict = None):
"""
Adds a single document to a specified collection in the database.
Args:
_id (str): Arango ID for the document, used as a unique identifier.
collection (str): The name of the collection to add the document to.
document (str): The document text to be added.
metadata (dict, optional): Metadata to be associated with the document. Defaults to None.
Returns:
None
"""
col = self.db.get_or_create_collection(collection)
if metadata is None:
metadata = {}
col.add(ids=[_id], documents=[document], metadatas=[metadata])
def add_chunks(self, collection: str, chunks: list, _key, metadata: dict = None):
"""
Adds chunks to a specified collection in the database.
@ -148,18 +365,88 @@ class ChromaDB:
ids.append(f"{_key}_{number}")
col.add(ids=ids, metadatas=metadatas, documents=chunks)
def get_collection(self, collection: str) -> chromadb.Collection:
"""
Retrieves a collection from the database.
Args:
collection (str): The name of the collection to retrieve.
Returns:
chromadb.Collection: The requested collection.
"""
return self.db.get_or_create_collection(collection)
def is_reference_chunk(text: str) -> bool:
"""
Determine if a text chunk primarily consists of academic references.
Args:
text (str): Text chunk to analyze
Returns:
bool: True if the chunk appears to be mainly references
"""
# Count significant reference indicators
indicators = 0
# Check for DOI links (very strong indicator)
doi_matches = len(re.findall(r'https?://doi\.org/10\.\d+/\S+', text))
if doi_matches >= 2: # Multiple DOIs almost certainly means references
return True
elif doi_matches == 1:
indicators += 3
# Check for citation patterns with year, volume, pages (e.g., 2018;178:551–60)
citation_patterns = len(re.findall(r'\d{4};\d+:\d+[-–]\d+', text))
indicators += citation_patterns * 2
# Check for year patterns in brackets [YYYY]
year_brackets = len(re.findall(r'\[\d{4}\]', text))
indicators += year_brackets
# Check for multiple lines starting with author name patterns
lines = [line.strip() for line in text.split('\n') if line.strip()]
author_started_lines = 0
for line in lines:
# Common pattern in references: starts with Author Name(s)
if re.match(r'^\s*[A-Z][a-z]+\s+[A-Z][a-z]+', line):
author_started_lines += 1
# If multiple lines start with author names (common in reference lists)
if author_started_lines >= 2:
indicators += 2
# Check for academic reference terms
if re.search(r'\bet al\b|\bet al\.\b', text, re.IGNORECASE):
indicators += 1
# Return True if we have sufficient indicators
return indicators >= 4 # Adjust threshold as needed
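
# Quick check of the heuristic on synthetic text (assuming is_reference_chunk
# is a module-level helper, as its signature without `self` suggests):
#
#   refs = (
#       "Smith J, Jones A. Plastic pollution. Env Sci. 2018;178:551-560. "
#       "https://doi.org/10.1000/example1 "
#       "Brown K et al. Water quality. 2019;12:33-41. https://doi.org/10.1000/example2"
#   )
#   is_reference_chunk(refs)   # True: two DOI links alone trip the early return
#   is_reference_chunk("Open science makes research transparent.")  # False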
if __name__ == "__main__":
from colorprinter.print_color import *
chroma = ChromaDB()
    print(chroma.db.list_collections())
    print('DB', chroma.db.database)
    print('SETTINGS', chroma.db.get_version())
    result = chroma.search_chunks(
        query="What is Open Science?",
        collections="lasse__other_documents",
        n_results=2,
        n_sources=3,
        max_retries=4,
    )
collection = chroma.db.get_or_create_collection("lasse__other_documents")
result = collection.query(
query_texts="What is Open Science?",
n_results=2,
)
from pprint import pprint
pprint(result)
#print_rainbow(result["metadatas"][0])

@ -1,574 +0,0 @@
import os
import base64
import re
from typing import Literal, Optional
import requests
import tiktoken
from ollama import (
Client,
AsyncClient,
ResponseError,
ChatResponse,
Tool,
Options,
)
import env_manager
from colorprinter.print_color import *
env_manager.set_env()
tokenizer = tiktoken.get_encoding("cl100k_base")
class LLM:
"""
LLM class for interacting with an instance of Ollama.
Attributes:
model (str): The model to be used for response generation.
system_message (str): The system message to be used in the chat.
options (dict): Options for the model, such as temperature.
messages (list): List of messages in the chat.
max_length_answer (int): Maximum length of the generated answer.
chat (bool): Whether the chat mode is enabled.
chosen_backend (str): The chosen backend server for the API.
client (Client): The client for synchronous API calls.
async_client (AsyncClient): The client for asynchronous API calls.
tools (list): List of tools to be used in generating the response.
Methods:
__init__(self, system_message, temperature, model, max_length_answer, messages, chat, chosen_backend):
Initializes the LLM class with the provided parameters.
get_model(self, model_alias):
Retrieves the model name based on the provided alias.
count_tokens(self):
Counts the number of tokens in the messages.
get_least_conn_server(self):
Retrieves the least connected server from the backend.
generate(self, query, user_input, context, stream, tools, images, model, temperature):
Generates a response based on the provided query and options.
make_summary(self, text):
Generates a summary of the provided text.
read_stream(self, response):
Handles streaming responses.
async_generate(self, query, user_input, context, stream, tools, images, model, temperature):
Asynchronously generates a response based on the provided query and options.
prepare_images(self, images, message):
"""
def __init__(
self,
system_message: str = "You are an assistant.",
temperature: float = 0.01,
model: Optional[
Literal["small", "standard", "vision", "reasoning", "tools"]
] = "standard",
max_length_answer: int = 4096,
messages: list[dict] = None,
chat: bool = True,
chosen_backend: str = None,
tools: list = None,
) -> None:
"""
Initialize the assistant with the given parameters.
Args:
system_message (str): The initial system message for the assistant. Defaults to "You are an assistant.".
temperature (float): The temperature setting for the model, affecting randomness. Defaults to 0.01.
model (Optional[Literal["small", "standard", "vision", "reasoning"]]): The model type to use. Defaults to "standard".
max_length_answer (int): The maximum length of the generated answer. Defaults to 4096.
messages (list[dict], optional): A list of initial messages. Defaults to None.
chat (bool): Whether the assistant is in chat mode. Defaults to True.
chosen_backend (str, optional): The backend server to use. If not provided, the least connected server is chosen.
Returns:
None
"""
self.model = self.get_model(model)
self.call_model = (
self.model
        )  # Updated on each call to record which model was actually used
self.system_message = system_message
self.options = {"temperature": temperature}
self.messages = messages or [{"role": "system", "content": self.system_message}]
self.max_length_answer = max_length_answer
self.chat = chat
if not chosen_backend:
chosen_backend = self.get_least_conn_server()
self.chosen_backend = chosen_backend
headers = {
"Authorization": f"Basic {self.get_credentials()}",
"X-Chosen-Backend": self.chosen_backend,
}
        # removesuffix (not rstrip) so only the literal "/api/chat/" suffix is removed
        self.host_url = os.getenv("LLM_API_URL").removesuffix("/api/chat/")
self.host_url = 'http://192.168.1.12:3300' #! Change back when possible
self.client: Client = Client(host=self.host_url, headers=headers, timeout=120)
self.async_client: AsyncClient = AsyncClient()
def get_credentials(self):
# Initialize the client with the host and default headers
credentials = f"{os.getenv('LLM_API_USER')}:{os.getenv('LLM_API_PWD_LASSE')}"
return base64.b64encode(credentials.encode()).decode()
def get_model(self, model_alias):
models = {
"standard": "LLM_MODEL",
"small": "LLM_MODEL_SMALL",
"vision": "LLM_MODEL_VISION",
"standard_64k": "LLM_MODEL_LARGE",
"reasoning": "LLM_MODEL_REASONING",
"tools": "LLM_MODEL_TOOLS",
}
model = os.getenv(models.get(model_alias, "LLM_MODEL"))
self.model = model
return model
def count_tokens(self):
num_tokens = 0
for i in self.messages:
for k, v in i.items():
if k == "content":
if not isinstance(v, str):
v = str(v)
tokens = tokenizer.encode(v)
num_tokens += len(tokens)
return int(num_tokens)
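    # Sketch of what count_tokens() measures: cl100k_base tokens over every
    # message's content (assumes the env vars and backend this module expects):
    #
    #   llm = LLM(system_message="You are an assistant.")
    #   llm.messages.append({"role": "user", "content": "Add 2 and 2."})
    #   llm.count_tokens()
    #   # equivalent to:
    #   # sum(len(tokenizer.encode(str(m["content"]))) for m in llm.messages)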
def get_least_conn_server(self):
try:
response = requests.get("http://192.168.1.12:5000/least_conn")
response.raise_for_status()
# Extract the least connected server from the response
least_conn_server = response.headers.get("X-Upstream-Address")
return least_conn_server
except requests.RequestException as e:
print_red("Error getting least connected server:", e)
return None
def generate(
self,
query: str = None,
user_input: str = None,
context: str = None,
stream: bool = False,
tools: list = None,
images: list = None,
model: Optional[
Literal["small", "standard", "vision", "reasoning", "tools"]
] = None,
temperature: float = None,
messages: list[dict] = None,
format = None,
think = False
):
"""
Generate a response based on the provided query and context.
Parameters:
query (str): The query string from the user.
user_input (str): Additional user input to be appended to the last message.
context (str): Contextual information to be used in generating the response.
stream (bool): Whether to stream the response.
tools (list): List of tools to be used in generating the response.
images (list): List of images to be included in the response.
model (Optional[Literal["small", "standard", "vision", "tools"]]): The model type to be used.
temperature (float): The temperature setting for the model.
messages (list[dict]): List of previous messages in the conversation.
format (Optional[BaseModel]): The format of the response.
think (bool): Whether to use the reasoning model.
Returns:
str: The generated response or an error message if an exception occurs.
"""
print_yellow(stream)
print_yellow("GENERATE")
# Prepare the model and temperature
model = self.get_model(model) if model else self.model
# if model == self.get_model('tools'):
# stream = False
temperature = temperature if temperature else self.options["temperature"]
if messages:
messages = [
{"role": i["role"], "content": re.sub(r"\s*\n\s*", "\n", i["content"])}
for i in messages
]
message = messages.pop(-1)
query = message["content"]
self.messages = messages
else:
# Normalize whitespace and add the query to the messages
query = re.sub(r"\s*\n\s*", "\n", query)
message = {"role": "user", "content": query}
# Handle images if any
if images:
message = self.prepare_images(images, message)
model = self.get_model("vision")
self.messages.append(message)
# Prepare headers
headers = {"Authorization": f"Basic {self.get_credentials()}"}
if self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]: #TODO Maybe reasoning shouldn't be here.
headers["X-Chosen-Backend"] = self.chosen_backend
if model == self.get_model("small"):
headers["X-Model-Type"] = "small"
if model == self.get_model("tools"):
headers["X-Model-Type"] = "tools"
reasoning_models = ['qwen3', 'deepseek'] #TODO Add more reasoning models here when added to ollama
        if any(name in model for name in reasoning_models):
            # Prefix the already-appended message; modifying the local `query`
            # here would never reach the model
            if think:
                self.messages[-1]["content"] = f"/think\n{self.messages[-1]['content']}"
            else:
                self.messages[-1]["content"] = f"/no_think\n{self.messages[-1]['content']}"
# Prepare options
options = Options(**self.options)
options.temperature = temperature
print_yellow("Stream the answer?", stream)
# Call the client.chat method
try:
self.call_model = model
self.client: Client = Client(host=self.host_url, headers=headers, timeout=300) #!
#print_rainbow(self.client._client.__dict__)
print_yellow("Model used in call:", model)
# if headers:
# self.client.headers.update(headers)
response = self.client.chat(
model=model,
messages=self.messages,
tools=tools,
stream=stream,
options=options,
keep_alive=3600 * 24 * 7,
format=format
)
except ResponseError as e:
print_red("Error!")
print(e)
return "An error occurred."
# print_rainbow(response.__dict__)
# If user_input is provided, update the last message
if user_input:
if context:
if len(context) > 2000:
context = self.make_summary(context)
user_input = (
f"{user_input}\n\nUse the information below to answer the question.\n"
f'"""{context}"""\n[This is a summary of the context provided in the original message.]'
)
system_message_info = "\nSometimes some of the messages in the chat history are summarised, then that is clearly indicated in the message."
if system_message_info not in self.messages[0]["content"]:
self.messages[0]["content"] += system_message_info
self.messages[-1] = {"role": "user", "content": user_input}
# self.chosen_backend = self.client.last_response.headers.get("X-Chosen-Backend")
# Handle streaming response
if stream:
print_purple("STREAMING")
return self.read_stream(response)
else:
print_purple("NOT STREAMING")
# Process the response
if isinstance(response, ChatResponse):
result = response.message.content.strip('"')
if '</think>' in result:
result = result.split('</think>')[-1]
self.messages.append(
{"role": "assistant", "content": result.strip('"')}
)
if tools and not response.message.get("tool_calls"):
print_yellow("No tool calls in response".upper())
if not self.chat:
self.messages = [self.messages[0]]
if not think:
response.message.content = remove_thinking(response.message.content)
return response.message
else:
print_red("Unexpected response type")
return "An error occurred."
def make_summary(self, text):
# Implement your summary logic using self.client.chat()
summary_message = {
"role": "user",
"content": f'Summarize the text below:\n"""{text}"""\nRemember to be concise and detailed. Answer in English.',
}
messages = [
{
"role": "system",
"content": "You are summarizing a text. Make it detailed and concise. Answer ONLY with the summary. Don't add any new information.",
},
summary_message,
]
try:
response = self.client.chat(
model=self.get_model("small"),
messages=messages,
options=Options(temperature=0.01),
keep_alive=3600 * 24 * 7,
)
summary = response.message.content.strip()
print_blue("Summary:", summary)
return summary
except ResponseError as e:
print_red("Error generating summary:", e)
return "Summary generation failed."
def read_stream(self, response):
"""
        Yields tuples of (chunk_type, text). While the model is inside a <think>
        block, yields ('thinking', ...) up to and including </think>; after that,
        yields ('normal', ...) for the rest of the text.
"""
thinking_buffer = ""
in_thinking = self.call_model == self.get_model("reasoning")
first_chunk = True
prev_content = None
for chunk in response:
if not chunk:
continue
content = chunk.message.content
# Remove leading quote if it's the first chunk
if first_chunk and content.startswith('"'):
content = content[1:]
first_chunk = False
if in_thinking:
thinking_buffer += content
if "</think>" in thinking_buffer:
end_idx = thinking_buffer.index("</think>") + len("</think>")
yield ("thinking", thinking_buffer[:end_idx])
remaining = thinking_buffer[end_idx:].strip('"')
if chunk.done and remaining:
yield ("normal", remaining)
break
else:
prev_content = remaining
in_thinking = False
else:
if prev_content:
yield ("normal", prev_content)
prev_content = content
if chunk.done:
if prev_content and prev_content.endswith('"'):
prev_content = prev_content[:-1]
if prev_content:
yield ("normal", prev_content)
break
self.messages.append({"role": "assistant", "content": ""})
async def async_generate(
self,
query: str = None,
user_input: str = None,
context: str = None,
stream: bool = False,
tools: list = None,
images: list = None,
model: Optional[Literal["small", "standard", "vision"]] = None,
temperature: float = None,
):
"""
Asynchronously generates a response based on the provided query and other parameters.
Args:
query (str, optional): The query string to generate a response for.
user_input (str, optional): Additional user input to be included in the response.
context (str, optional): Context information to be used in generating the response.
stream (bool, optional): Whether to stream the response. Defaults to False.
tools (list, optional): List of tools to be used in generating the response. Will set the model to 'tools'.
images (list, optional): List of images to be included in the response.
model (Optional[Literal["small", "standard", "vision", "tools"]], optional): The model to be used for generating the response.
temperature (float, optional): The temperature setting for the model.
Returns:
str: The generated response or an error message if an exception occurs.
Raises:
ResponseError: If an error occurs during the response generation.
Notes:
- The function prepares the model and temperature settings.
- It normalizes whitespace in the query and handles images if provided.
- It prepares headers and options for the request.
- It adjusts options for long messages and calls the async client's chat method.
- If user_input is provided, it updates the last message.
- It updates the chosen backend based on the response headers.
- It handles streaming responses and processes the response accordingly.
            - It's not necessary to set model to 'tools' if you provide tools as an argument.
"""
print_yellow("ASYNC GENERATE")
        # Normalize whitespace and add the query to the messages
query = re.sub(r"\s*\n\s*", "\n", query)
message = {"role": "user", "content": query}
self.messages.append(message)
# Prepare the model and temperature
model = self.get_model(model) if model else self.model
temperature = temperature if temperature else self.options["temperature"]
# Prepare options
options = Options(**self.options)
options.temperature = temperature
# Prepare headers
headers = {}
# Set model depending on the input
if images:
message = self.prepare_images(images, message)
model = self.get_model("vision")
elif tools:
model = self.get_model("tools")
headers["X-Model-Type"] = "tools"
tools = [Tool(**tool) if isinstance(tool, dict) else tool for tool in tools]
elif self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]:
headers["X-Chosen-Backend"] = self.chosen_backend
elif model == self.get_model("small"):
headers["X-Model-Type"] = "small"
# Adjust options for long messages
if self.chat or len(self.messages) > 15000:
num_tokens = self.count_tokens() + self.max_length_answer // 2
if num_tokens > 8000 and model not in [
self.get_model("vision"),
self.get_model("tools"),
]:
model = self.get_model("standard_64k")
headers["X-Model-Type"] = "large"
# Call the async client's chat method
try:
response = await self.async_client.chat(
model=model,
messages=self.messages,
headers=headers,
tools=tools,
stream=stream,
options=options,
keep_alive=3600 * 24 * 7,
)
except ResponseError as e:
print_red("Error!")
print(e)
return "An error occurred."
# If user_input is provided, update the last message
if user_input:
if context:
if len(context) > 2000:
context = self.make_summary(context)
user_input = (
f"{user_input}\n\nUse the information below to answer the question.\n"
f'"""{context}"""\n[This is a summary of the context provided in the original message.]'
)
system_message_info = "\nSometimes some of the messages in the chat history are summarised, then that is clearly indicated in the message."
if system_message_info not in self.messages[0]["content"]:
self.messages[0]["content"] += system_message_info
self.messages[-1] = {"role": "user", "content": user_input}
print_red(self.async_client.last_response.headers.get("X-Chosen-Backend", "No backend"))
# Update chosen_backend
if model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]:
self.chosen_backend = self.async_client.last_response.headers.get(
"X-Chosen-Backend"
)
# Handle streaming response
if stream:
return self.read_stream(response)
else:
# Process the response
if isinstance(response, ChatResponse):
result = response.message.content.strip('"')
self.messages.append(
{"role": "assistant", "content": result.strip('"')}
)
if tools and not response.message.get("tool_calls"):
print_yellow("No tool calls in response".upper())
if not self.chat:
self.messages = [self.messages[0]]
return result
else:
print_red("Unexpected response type")
return "An error occurred."
def prepare_images(self, images, message):
"""
Prepares a list of images by converting them to base64 encoded strings and adds them to the provided message dictionary.
Args:
images (list): A list of images, where each image can be a file path (str), a base64 encoded string (str), or bytes.
message (dict): A dictionary to which the base64 encoded images will be added under the key "images".
Returns:
dict: The updated message dictionary with the base64 encoded images added under the key "images".
Raises:
ValueError: If an image is not a string or bytes.
"""
import base64
base64_images = []
base64_pattern = re.compile(r"^[A-Za-z0-9+/]+={0,2}$")
for image in images:
if isinstance(image, str):
if base64_pattern.match(image):
base64_images.append(image)
else:
with open(image, "rb") as image_file:
base64_images.append(
base64.b64encode(image_file.read()).decode("utf-8")
)
elif isinstance(image, bytes):
base64_images.append(base64.b64encode(image).decode("utf-8"))
else:
print_red("Invalid image type")
message["images"] = base64_images
# Use the vision model
return message
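    # Sketch of the accepted image forms (the file path is hypothetical):
    #
    #   llm = LLM()
    #   message = {"role": "user", "content": "Describe this image."}
    #   with open("figure.png", "rb") as f:
    #       message = llm.prepare_images([f.read()], message)  # raw bytes
    #   # A file path ("figure.png") or an already base64-encoded string works
    #   # too; message["images"] ends up as a list of base64 strings for the
    #   # vision model.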
def remove_thinking(response):
"""Remove the thinking section from the response"""
response_text = response.content if hasattr(response, "content") else str(response)
if "</think>" in response_text:
return response_text.split("</think>")[1].strip()
return response_text
if __name__ == "__main__":
llm = LLM()
result = llm.generate(
query="I want to add 2 and 2",
)
print(result.content)

@ -0,0 +1,581 @@
from _llm import LLM
if __name__ == "__main__":
llm = LLM()
result = llm.generate(
query="I want to add 2 and 2",
think=True,
)
print(result)
# import os
# import base64
# import re
# from typing import Literal, Optional
# from pydantic import BaseModel
# import requests
# import tiktoken
# from ollama import (
# Client,
# AsyncClient,
# ResponseError,
# ChatResponse,
# Tool,
# Options,
# )
# import env_manager
# from colorprinter.print_color import *
# env_manager.set_env()
# tokenizer = tiktoken.get_encoding("cl100k_base")
# class LLM:
# """
# LLM class for interacting with an instance of Ollama.
# Attributes:
# model (str): The model to be used for response generation.
# system_message (str): The system message to be used in the chat.
# options (dict): Options for the model, such as temperature.
# messages (list): List of messages in the chat.
# max_length_answer (int): Maximum length of the generated answer.
# chat (bool): Whether the chat mode is enabled.
# chosen_backend (str): The chosen backend server for the API.
# client (Client): The client for synchronous API calls.
# async_client (AsyncClient): The client for asynchronous API calls.
# tools (list): List of tools to be used in generating the response.
# Methods:
# __init__(self, system_message, temperature, model, max_length_answer, messages, chat, chosen_backend):
# Initializes the LLM class with the provided parameters.
# get_model(self, model_alias):
# Retrieves the model name based on the provided alias.
# count_tokens(self):
# Counts the number of tokens in the messages.
# get_least_conn_server(self):
# Retrieves the least connected server from the backend.
# generate(self, query, user_input, context, stream, tools, images, model, temperature):
# Generates a response based on the provided query and options.
# make_summary(self, text):
# Generates a summary of the provided text.
# read_stream(self, response):
# Handles streaming responses.
# async_generate(self, query, user_input, context, stream, tools, images, model, temperature):
# Asynchronously generates a response based on the provided query and options.
# prepare_images(self, images, message):
# """
# def __init__(
# self,
# system_message: str = "You are an assistant.",
# temperature: float = 0.01,
# model: Optional[
# Literal["small", "standard", "vision", "reasoning", "tools"]
# ] = "standard",
# max_length_answer: int = 4096,
# messages: list[dict] = None,
# chat: bool = True,
# chosen_backend: str = None,
# tools: list = None,
# ) -> None:
# """
# Initialize the assistant with the given parameters.
# Args:
# system_message (str): The initial system message for the assistant. Defaults to "You are an assistant.".
# temperature (float): The temperature setting for the model, affecting randomness. Defaults to 0.01.
# model (Optional[Literal["small", "standard", "vision", "reasoning"]]): The model type to use. Defaults to "standard".
# max_length_answer (int): The maximum length of the generated answer. Defaults to 4096.
# messages (list[dict], optional): A list of initial messages. Defaults to None.
# chat (bool): Whether the assistant is in chat mode. Defaults to True.
# chosen_backend (str, optional): The backend server to use. If not provided, the least connected server is chosen.
# Returns:
# None
# """
# self.model = self.get_model(model)
# self.call_model = (
# self.model
# ) # This is set per call to decide what model that was actually used
# self.system_message = system_message
# self.options = {"temperature": temperature}
# self.messages = messages or [{"role": "system", "content": self.system_message}]
# self.max_length_answer = max_length_answer
# self.chat = chat
# if not chosen_backend:
# chosen_backend = self.get_least_conn_server()
# self.chosen_backend = chosen_backend
# headers = {
# "Authorization": f"Basic {self.get_credentials()}",
# "X-Chosen-Backend": self.chosen_backend,
# }
# self.host_url = os.getenv("LLM_API_URL").rstrip("/api/chat/")
# self.host_url = 'http://192.168.1.12:3300' #! Change back when possible
# self.client: Client = Client(host=self.host_url, headers=headers, timeout=240)
# self.async_client: AsyncClient = AsyncClient()
# def get_credentials(self):
# # Initialize the client with the host and default headers
# credentials = f"{os.getenv('LLM_API_USER')}:{os.getenv('LLM_API_PWD_LASSE')}"
# return base64.b64encode(credentials.encode()).decode()
# def get_model(self, model_alias):
# models = {
# "standard": "LLM_MODEL",
# "small": "LLM_MODEL_SMALL",
# "vision": "LLM_MODEL_VISION",
# "standard_64k": "LLM_MODEL_LARGE",
# "reasoning": "LLM_MODEL_REASONING",
# "tools": "LLM_MODEL_TOOLS",
# }
# model = os.getenv(models.get(model_alias, "LLM_MODEL"))
# self.model = model
# return model
# def count_tokens(self):
# num_tokens = 0
# for i in self.messages:
# for k, v in i.items():
# if k == "content":
# if not isinstance(v, str):
# v = str(v)
# tokens = tokenizer.encode(v)
# num_tokens += len(tokens)
# return int(num_tokens)
# def get_least_conn_server(self):
# try:
# response = requests.get("http://192.168.1.12:5000/least_conn")
# response.raise_for_status()
# # Extract the least connected server from the response
# least_conn_server = response.headers.get("X-Upstream-Address")
# return least_conn_server
# except requests.RequestException as e:
# print_red("Error getting least connected server:", e)
# return None
# def generate(
# self,
# query: str = None,
# user_input: str = None,
# context: str = None,
# stream: bool = False,
# tools: list = None,
# images: list = None,
# model: Optional[
# Literal["small", "standard", "vision", "reasoning", "tools"]
# ] = None,
# temperature: float = None,
# messages: list[dict] = None,
# format: BaseModel = None,
# think: bool = False
# ):
# """
# Generate a response based on the provided query and context.
# Parameters:
# query (str): The query string from the user.
# user_input (str): Additional user input to be appended to the last message.
# context (str): Contextual information to be used in generating the response.
# stream (bool): Whether to stream the response.
# tools (list): List of tools to be used in generating the response.
# images (list): List of images to be included in the response.
# model (Optional[Literal["small", "standard", "vision", "tools"]]): The model type to be used.
# temperature (float): The temperature setting for the model.
# messages (list[dict]): List of previous messages in the conversation.
# format (Optional[BaseModel]): The format of the response.
# think (bool): Whether to use the reasoning model.
# Returns:
# str: The generated response or an error message if an exception occurs.
# """
# # Prepare the model and temperature
# model = self.get_model(model) if model else self.model
# # if model == self.get_model('tools'):
# # stream = False
# temperature = temperature if temperature else self.options["temperature"]
# if messages:
# messages = [
# {"role": i["role"], "content": re.sub(r"\s*\n\s*", "\n", i["content"])}
# for i in messages
# ]
# message = messages.pop(-1)
# query = message["content"]
# self.messages = messages
# else:
# # Normalize whitespace and add the query to the messages
# query = re.sub(r"\s*\n\s*", "\n", query)
# message = {"role": "user", "content": query}
# # Handle images if any
# if images:
# message = self.prepare_images(images, message)
# model = self.get_model("vision")
# self.messages.append(message)
# # Prepare headers
# headers = {"Authorization": f"Basic {self.get_credentials()}"}
# if self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]: #TODO Maybe reasoning shouldn't be here.
# headers["X-Chosen-Backend"] = self.chosen_backend
# if model == self.get_model("small"):
# headers["X-Model-Type"] = "small"
# if model == self.get_model("tools"):
# headers["X-Model-Type"] = "tools"
# reasoning_models = ['qwen3', 'deepseek'] #TODO Add more reasoning models here when added to ollama
# if any([model_name in model for model_name in reasoning_models]):
# if think:
# self.messages[-1]['content'] = f"/think\n{self.messages[-1]['content']}"
# else:
# self.messages[-1]['content'] = f"/no_think\n{self.messages[-1]['content']}"
# # Prepare options
# options = Options(**self.options)
# options.temperature = temperature
# # Call the client.chat method
# try:
# self.call_model = model
# self.client: Client = Client(host=self.host_url, headers=headers, timeout=300) #!
# #print_rainbow(self.client._client.__dict__)
# print_yellow(f"🤖 Generating using {model}...")
# # if headers:
# # self.client.headers.update(headers)
# response = self.client.chat(
# model=model,
# messages=self.messages,
# tools=tools,
# stream=stream,
# options=options,
# keep_alive=3600 * 24 * 7,
# format=format
# )
# except ResponseError as e:
# print_red("Error!")
# print(e)
# return "An error occurred."
# # print_rainbow(response.__dict__)
# # If user_input is provided, update the last message
# if user_input:
# if context:
# if len(context) > 2000:
# context = self.make_summary(context)
# user_input = (
# f"{user_input}\n\nUse the information below to answer the question.\n"
# f'"""{context}"""\n[This is a summary of the context provided in the original message.]'
# )
# system_message_info = "\nSometimes some of the messages in the chat history are summarised, then that is clearly indicated in the message."
# if system_message_info not in self.messages[0]["content"]:
# self.messages[0]["content"] += system_message_info
# self.messages[-1] = {"role": "user", "content": user_input}
# # self.chosen_backend = self.client.last_response.headers.get("X-Chosen-Backend")
# # Handle streaming response
# if stream:
# print_purple("STREAMING")
# return self.read_stream(response)
# else:
# # Process the response
# if isinstance(response, ChatResponse):
# result = response.message.content.strip('"')
# if '</think>' in result:
# result = result.split('</think>')[-1]
# self.messages.append(
# {"role": "assistant", "content": result.strip('"')}
# )
# if tools and not response.message.get("tool_calls"):
# print_yellow("No tool calls in response".upper())
# if not self.chat:
# self.messages = [self.messages[0]]
# if not think:
# response.message.content = remove_thinking(response.message.content)
# return response.message
# else:
# print_red("Unexpected response type")
# return "An error occurred."
# def make_summary(self, text):
# # Implement your summary logic using self.client.chat()
# summary_message = {
# "role": "user",
# "content": f'Summarize the text below:\n"""{text}"""\nRemember to be concise and detailed. Answer in English.',
# }
# messages = [
# {
# "role": "system",
# "content": "You are summarizing a text. Make it detailed and concise. Answer ONLY with the summary. Don't add any new information.",
# },
# summary_message,
# ]
# try:
# response = self.client.chat(
# model=self.get_model("small"),
# messages=messages,
# options=Options(temperature=0.01),
# keep_alive=3600 * 24 * 7,
# )
# summary = response.message.content.strip()
# print_blue("Summary:", summary)
# return summary
# except ResponseError as e:
# print_red("Error generating summary:", e)
# return "Summary generation failed."
# def read_stream(self, response):
# """
# Yields tuples of (chunk_type, text). The first tuple is ('thinking', ...)
# if in_thinking is True and stops at </think>. After that, yields ('normal', ...)
# for the rest of the text.
# """
# thinking_buffer = ""
# in_thinking = self.call_model == self.get_model("reasoning")
# first_chunk = True
# prev_content = None
# for chunk in response:
# if not chunk:
# continue
# content = chunk.message.content
# # Remove leading quote if it's the first chunk
# if first_chunk and content.startswith('"'):
# content = content[1:]
# first_chunk = False
# if in_thinking:
# thinking_buffer += content
# if "</think>" in thinking_buffer:
# end_idx = thinking_buffer.index("</think>") + len("</think>")
# yield ("thinking", thinking_buffer[:end_idx])
# remaining = thinking_buffer[end_idx:].strip('"')
# if chunk.done and remaining:
# yield ("normal", remaining)
# break
# else:
# prev_content = remaining
# in_thinking = False
# else:
# if prev_content:
# yield ("normal", prev_content)
# prev_content = content
# if chunk.done:
# if prev_content and prev_content.endswith('"'):
# prev_content = prev_content[:-1]
# if prev_content:
# yield ("normal", prev_content)
# break
# self.messages.append({"role": "assistant", "content": ""})
# async def async_generate(
# self,
# query: str = None,
# user_input: str = None,
# context: str = None,
# stream: bool = False,
# tools: list = None,
# images: list = None,
# model: Optional[Literal["small", "standard", "vision", "tools"]] = None,
# temperature: float = None,
# ):
# """
# Asynchronously generates a response based on the provided query and other parameters.
# Args:
# query (str, optional): The query string to generate a response for.
# user_input (str, optional): Additional user input to be included in the response.
# context (str, optional): Context information to be used in generating the response.
# stream (bool, optional): Whether to stream the response. Defaults to False.
# tools (list, optional): List of tools to be used in generating the response. Will set the model to 'tools'.
# images (list, optional): List of images to be included in the response.
# model (Optional[Literal["small", "standard", "vision", "tools"]], optional): The model to be used for generating the response.
# temperature (float, optional): The temperature setting for the model.
# Returns:
# str: The generated response or an error message if an exception occurs.
# Raises:
# ResponseError: If an error occurs during the response generation.
# Notes:
# - The function prepares the model and temperature settings.
# - It normalizes whitespace in the query and handles images if provided.
# - It prepares headers and options for the request.
# - It adjusts options for long messages and calls the async client's chat method.
# - If user_input is provided, it updates the last message.
# - It updates the chosen backend based on the response headers.
# - It handles streaming responses and processes the response accordingly.
# - It's not necessary to set model to 'tools' if you provide tools as an argument.
# """
# print_yellow("ASYNC GENERATE")
# # Normalize whitespace and add the query to the messages
# query = re.sub(r"\s*\n\s*", "\n", query)
# message = {"role": "user", "content": query}
# self.messages.append(message)
# # Prepare the model and temperature
# model = self.get_model(model) if model else self.model
# temperature = temperature if temperature else self.options["temperature"]
# # Prepare options
# options = Options(**self.options)
# options.temperature = temperature
# # Prepare headers
# headers = {}
# # Set model depending on the input
# if images:
# message = self.prepare_images(images, message)
# model = self.get_model("vision")
# elif tools:
# model = self.get_model("tools")
# headers["X-Model-Type"] = "tools"
# tools = [Tool(**tool) if isinstance(tool, dict) else tool for tool in tools]
# elif self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]:
# headers["X-Chosen-Backend"] = self.chosen_backend
# elif model == self.get_model("small"):
# headers["X-Model-Type"] = "small"
# # Adjust options for long messages
# if self.chat or len(self.messages) > 15000:
# num_tokens = self.count_tokens() + self.max_length_answer // 2
# if num_tokens > 8000 and model not in [
# self.get_model("vision"),
# self.get_model("tools"),
# ]:
# model = self.get_model("standard_64k")
# headers["X-Model-Type"] = "large"
# # Call the async client's chat method
# try:
# response = await self.async_client.chat(
# model=model,
# messages=self.messages,
# headers=headers,
# tools=tools,
# stream=stream,
# options=options,
# keep_alive=3600 * 24 * 7,
# )
# except ResponseError as e:
# print_red("Error!")
# print(e)
# return "An error occurred."
# # If user_input is provided, update the last message
# if user_input:
# if context:
# if len(context) > 2000:
# context = self.make_summary(context)
# user_input = (
# f"{user_input}\n\nUse the information below to answer the question.\n"
# f'"""{context}"""\n[This is a summary of the context provided in the original message.]'
# )
# system_message_info = "\nSometimes some of the messages in the chat history are summarised, then that is clearly indicated in the message."
# if system_message_info not in self.messages[0]["content"]:
# self.messages[0]["content"] += system_message_info
# self.messages[-1] = {"role": "user", "content": user_input}
# print_red(self.async_client.last_response.headers.get("X-Chosen-Backend", "No backend"))
# # Update chosen_backend
# if model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]:
# self.chosen_backend = self.async_client.last_response.headers.get(
# "X-Chosen-Backend"
# )
# # Handle streaming response
# if stream:
# return self.read_stream(response)
# else:
# # Process the response
# if isinstance(response, ChatResponse):
# result = response.message.content.strip('"')
# self.messages.append(
# {"role": "assistant", "content": result.strip('"')}
# )
# if tools and not response.message.get("tool_calls"):
# print_yellow("No tool calls in response".upper())
# if not self.chat:
# self.messages = [self.messages[0]]
# return result
# else:
# print_red("Unexpected response type")
# return "An error occurred."
# def prepare_images(self, images, message):
# """
# Prepares a list of images by converting them to base64 encoded strings and adds them to the provided message dictionary.
# Args:
# images (list): A list of images, where each image can be a file path (str), a base64 encoded string (str), or bytes.
# message (dict): A dictionary to which the base64 encoded images will be added under the key "images".
# Returns:
# dict: The updated message dictionary with the base64 encoded images added under the key "images".
# Raises:
# ValueError: If an image is not a string or bytes.
# """
# import base64
# base64_images = []
# base64_pattern = re.compile(r"^[A-Za-z0-9+/]+={0,2}$")
# for image in images:
# if isinstance(image, str):
# if base64_pattern.match(image):
# base64_images.append(image)
# else:
# with open(image, "rb") as image_file:
# base64_images.append(
# base64.b64encode(image_file.read()).decode("utf-8")
# )
# elif isinstance(image, bytes):
# base64_images.append(base64.b64encode(image).decode("utf-8"))
# else:
# print_red("Invalid image type")
# message["images"] = base64_images
# # Use the vision model
# return message
# def remove_thinking(response):
# """Remove the thinking section from the response"""
# response_text = response.content if hasattr(response, "content") else str(response)
# if "</think>" in response_text:
# return response_text.split("</think>")[1].strip()
# return response_text
# if __name__ == "__main__":
# llm = LLM()
# result = llm.generate(
# query="I want to add 2 and 2",
# )
# print(result.content)

File diff suppressed because it is too large

@ -18,13 +18,14 @@ import xml.etree.ElementTree as ET
from streamlit.runtime.uploaded_file_manager import UploadedFile
import streamlit as st
from _arango import ArangoDB
from _arango import ArangoDB, COLLECTIONS_IN_BASE
from _chromadb import ChromaDB
from _llm import LLM
from colorprinter.print_color import *
from utils import fix_key
from utils import fix_key, is_reference_chunk
import semantic_schoolar
from models import ArticleMetadataResponse
class Document:
def __init__(
@ -39,6 +40,7 @@ class Document:
_key: str = None,
arango_db_name: str = None,
arango_collection: str = None,
arango_doc: dict = None
):
self.filename = filename
self.pdf_file = pdf_file
@ -50,6 +52,7 @@ class Document:
self.arango_db_name = arango_db_name
self.arango_collection = arango_collection
self.text = text
self.arango_doc: dict = arango_doc
self.chunks = []
self.pdf = None
@ -61,6 +64,8 @@ class Document:
self.download_folder = None
self.document_type = None
if self._key:
self._key = fix_key(self._key)
if self.pdf_file:
self.open_pdf(self.pdf_file)
@ -71,9 +76,8 @@ class Document:
if not self._id:
return
data = {
"text": self.text,
"arango_doc": self.arango_doc,
"arango_db_name": self.arango_db_name,
"arango_id": self._id,
"is_sci": self.is_sci,
}
@ -132,7 +136,13 @@ class Document:
else:
better_chunks.append(chunk.strip())
self.chunks = better_chunks
# Check if the chunk is mainly academic references
self.chunks = []
for chunk in better_chunks:
if not is_reference_chunk(chunk):
self.chunks.append(chunk)
else:
print_yellow(f"Chunk is mainly academic references, skipping it.\n{chunk[:100]}...")
def get_title(self, only_meta=False):
"""
@ -238,7 +248,84 @@ class Document:
class Processor:
"""
Processor class for handling scientific and non-scientific document ingestion, metadata extraction, and storage.
This class provides a comprehensive pipeline for processing documents (primarily PDFs): it extracts metadata (such as DOI, title, authors, and journal), verifies and enriches that metadata using external APIs (CrossRef, Semantic Scholar, DOAJ), chunks the document text, and stores both the document and its chunks in vector and document databases (ChromaDB and ArangoDB).
Key Features:
-------------
- Extracts DOI from filenames and document text using regex and LLM fallback.
- Retrieves and verifies metadata from CrossRef, Semantic Scholar, and DOAJ.
- Handles both scientific articles and other document types, with appropriate collection routing.
- Chunks document text for vector storage and search.
- Stores documents and chunks in ArangoDB (document DB) and ChromaDB (vector DB).
- Manages user access and open access flags.
- Supports background summary generation for scientific articles.
- Provides PDF download utilities from open access sources.
- Designed for extensibility and robust error handling.
Parameters:
-----------
document : Document
The document object to be processed.
filename : str, optional
The filename of the document (default: None).
chroma_db : str, optional
Name of the ChromaDB database to use (default: "sci_articles").
len_chunks : int, optional
Length of text chunks for vector storage (default: 2200).
local_chroma_deployment : bool, optional
Whether to use a local ChromaDB deployment (default: False).
process : bool, optional
Whether to immediately process the document upon initialization (default: True).
document_type : str, optional
Type of the document for collection routing (default: None).
username : str, optional
Username for access control and database routing (default: None).
Methods:
get_arango(db_name=None, document_type=None)
Get the appropriate ArangoDB collection for the document, based on its type.
extract_doi(text, multi=False)
Extract DOI(s) from text using regex and LLM fallback.
chunks2chroma(_id, key)
Add document chunks to ChromaDB vector database.
chunks2arango()
Add document chunks and metadata to ArangoDB document database.
llm2metadata()
Extract metadata from a scientific article using an LLM.
get_crossref(doi)
Retrieve and parse metadata from CrossRef by DOI.
check_doaj(doi)
Check if a DOI is listed in DOAJ and retrieve metadata.
get_semantic_scholar_by_doi(doi)
Retrieve and verify metadata from Semantic Scholar by DOI.
get_semantic_scholar_by_title(title)
Retrieve and verify metadata from Semantic Scholar by title.
process_document()
Main pipeline for processing, extracting, chunking, and storing the document.
dl_pyppeteer(doi, url)
Download a PDF using a headless browser (async).
doi2pdf(doi)
Download a PDF for a DOI from open access sources or retrieve from database.
Attributes:
-----------
document : Document
The document being processed.
chromadb : ChromaDB
The ChromaDB instance for vector storage.
len_chunks : int
Length of text chunks for vector storage.
document_type : str
Type of the document for collection routing.
filename : str
Filename of the document.
username : str
Username for access control and database routing.
_id : str
Internal document ID after processing.
Usage:
------
processor = Processor(document, filename="paper.pdf")
"""
def __init__(
self,
document: Document,
filename: str = None,
@ -249,6 +336,31 @@ class Processor:
document_type: str = None,
username: str = None,
):
"""
Initializes the class with the provided document and configuration parameters.
Args:
document (Document): The document object to be processed and stored.
filename (str, optional): The filename associated with the document. Defaults to None.
chroma_db (str, optional): The name of the ChromaDB database to use. Defaults to "sci_articles".
len_chunks (int, optional): The length of text chunks for processing. Defaults to 2200.
local_chroma_deployment (bool, optional): Whether to use a local ChromaDB deployment. Defaults to False.
process (bool, optional): Whether to process the document upon initialization. Defaults to True.
document_type (str, optional): The type/category of the document. Defaults to None.
username (str, optional): The username associated with the document. If not provided, uses document.username. Defaults to None.
Attributes:
document (Document): The document object.
chromadb (ChromaDB): The ChromaDB instance for database operations.
len_chunks (int): The length of text chunks for processing.
document_type (str): The type/category of the document.
filename (str): The filename associated with the document.
username (str): The username associated with the document.
_id: Internal identifier for the document.
Side Effects:
If process is True, calls self.process_document() to process the document.
"""
self.document = document
self.chromadb = ChromaDB(local_deployment=local_chroma_deployment, db=chroma_db)
self.len_chunks = len_chunks
@ -258,28 +370,47 @@ class Processor:
self.username = username if username else document.username
self._id = None
self._key = None
if process:
self.process_document()
def get_arango(self, db_name=None, document_type=None):
if db_name and document_type:
arango = ArangoDB(db_name=db_name)
arango_collection = arango.db.collection(document_type)
"""
Get an ArangoDB collection based on document type and context.
This method determines the appropriate ArangoDB collection to use based on the
document type and the document's properties.
Args:
db_name (str, optional): The name of the database to connect to.
Defaults to None, in which case the default database is used.
document_type (str, optional): The type of document, which maps to a collection name.
Defaults to None, in which case the method attempts to determine the appropriate collection.
Returns:
Collection: An ArangoDB collection object.
Raises:
AssertionError: If document_type is not provided for non-sci articles, or
if username is not provided for non-sci articles.
Notes:
- For document types in COLLECTIONS_IN_BASE, returns the corresponding collection.
- For scientific articles (document.is_sci == True), returns the "sci_articles" collection.
- For other documents, requires both document_type and document.username to be specified.
"""
if document_type in COLLECTIONS_IN_BASE:
return ArangoDB().get_collection(document_type)
elif self.document.is_sci:
arango = ArangoDB(db_name="base")
arango_collection = arango.db.collection("sci_articles")
elif self.document.open_access:
arango = ArangoDB(db_name="base")
arango_collection = arango.db.collection("other_documents")
return ArangoDB().get_collection("sci_articles")
else:
arango = ArangoDB(db_name=self.document.username)
arango_collection: ArangoCollection = arango.db.collection(
self.document_type
)
self.document.arango_db_name = arango.db.name
self.arango_collection = arango_collection
return arango_collection
assert document_type, "Document type must be provided for non-sci articles."
assert self.document.username, "Username must be provided for non-sci articles."
if self.document.username:
return ArangoDB(db_name=self.document.username).get_collection(document_type)
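# A minimal sketch of how the routing above resolves (hypothetical values;
# "interviews" is assumed here to be a per-user collection):
#
#   self.get_arango(document_type="sci_articles")  # in COLLECTIONS_IN_BASE -> base DB
#   self.get_arango()                              # document.is_sci -> base DB, "sci_articles"
#   self.get_arango(document_type="interviews")    # -> user DB "<username>", "interviews"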
def extract_doi(self, text, multi=False):
"""
@ -360,7 +491,7 @@ class Processor:
ids.append(id)
metadata = {
"_key": id,
"_key": self.document._key,
"file": self.document.file_path,
"chunk_nr": i,
"pages": ",".join([str(i) for i in page_numbers]),
@ -378,6 +509,11 @@ class Processor:
"sci_articles"
)
else:
print('collection name'.upper(), f"{self.username}__other_documents")
print_yellow(self.chromadb.db.list_collections())
print(self.chromadb.db.database)
print('VERSION', self.chromadb.db.get_version)
print('CHROMA DB', self.chromadb.db)
chroma_collection = self.chromadb.db.get_or_create_collection(
f"{self.username}__other_documents"
)
@ -385,6 +521,31 @@ class Processor:
chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)
def chunks2arango(self):
"""
Adds document chunks to an ArangoDB database.
This method processes the document and its chunks to store them in the ArangoDB.
It handles scientific and non-scientific documents differently, applies access control,
and manages document metadata.
Prerequisites:
- Document must have a 'text' attribute
- Scientific documents must have 'doi' and 'metadata' attributes
- Non-scientific documents must have either '_key' attribute or DOI
The method:
1. Validates document attributes
2. Gets ArangoDB collection
3. Processes document chunks with page information
4. Manages user access permissions
5. Creates the ArangoDB document with all necessary fields
6. Handles special processing for scientific documents with abstracts
7. Inserts the document into ArangoDB with update capabilities
8. Initiates background summary generation if needed
Returns:
tuple: A tuple containing (document_id, document_key)
"""
st.write("Adding to document database...")
assert self.document.text, "Document must have 'text' attribute."
if self.document.is_sci:
@ -397,7 +558,7 @@ class Processor:
getattr(self.document, "_key", None) or self.document.doi
), "Document must have '_key' attribute or DOI."
arango_collection = self.get_arango()
arango_collection = self.get_arango(document_type=self.document.arango_collection)
if self.document.doi:
key = self.document.doi
@ -435,7 +596,7 @@ class Processor:
if self.document.open_access:
user_access = None
arango_document = {
self.document.arango_doc = {
"_key": fix_key(self.document._key),
"file": self.document.file_path,
"chunks": arango_chunks,
@ -446,6 +607,7 @@ class Processor:
"metadata": self.document.metadata,
"filename": self.document.filename,
}
print_purple('Number of chunks:', len(self.document.arango_doc['chunks']))
if self.document.metadata and self.document.is_sci:
if "abstract" in self.document.metadata:
@ -453,8 +615,8 @@ class Processor:
self.document.metadata["abstract"] = re.sub(
r"<[^>]*>", "", self.document.metadata["abstract"]
)
arango_document["metadata"] = self.document.metadata
arango_document["summary"] = {
self.document.arango_doc["metadata"] = self.document.metadata
self.document.arango_doc["summary"] = {
"text_sum": (
self.document.metadata["abstract"]["text_sum"]
if "text_sum" in self.document.metadata["abstract"]
@ -463,20 +625,49 @@ class Processor:
"meta": {"model": "from_metadata"},
}
arango_document["crossref"] = True
self.document.arango_doc["crossref"] = True
doc = arango_collection.insert(
arango_document, overwrite=True, overwrite_mode="update", keep_none=False
arango = ArangoDB(db_name=self.document.arango_db_name)
print_purple(self.document.arango_collection, self.document.arango_db_name)
inserted_document = arango.insert_document(
collection_name=self.document.arango_collection,
document=self.document.arango_doc,
overwrite=True,
overwrite_mode="update",
keep_none=False
)
self.document._id = doc["_id"]
print_green("ArangoDB document inserted:", inserted_document['_id'])
self.document.arango_doc = arango.db.collection(
self.document.arango_collection
).get(self.document._key)
self.document._id = self.document.arango_doc["_id"]
if "summary" not in arango_document:
if "summary" not in self.document.arango_doc:
# Make a summary in the background
print_yellow("No summary found in the document, generating in background...")
print_rainbow(self.document.arango_doc['chunks'])
self.document.make_summary_in_background()
return doc["_id"], key
else:
print_green("Summary already exists in the document.")
print(self.document.arango_doc['summary'])
return self.document.arango_doc
def llm2metadata(self):
"""
Extract metadata from a scientific article PDF using a LLM.
Uses the first page (or first two pages for multi-page documents) of the PDF
to extract the title, publication date, and journal name via LLM.
Returns:
dict: A dictionary containing the extracted metadata with the following keys:
- "title": The article title (str)
- "published_date": The publication date (str)
- "journal": The journal name (str)
- "published_year": The publication year (int or None if not parseable)
Note:
Default values are provided for any metadata that cannot be extracted.
The published_year is extracted from published_date when possible.
"""
st.write("Extracting metadata using LLM...")
llm = LLM(
temperature=0.01,
@ -499,38 +690,27 @@ class Processor:
"""
Answer ONLY with the information requested.
I want to know the published date in the form "YYYY-MM-DD".
I want the full title of the article.
I want the name of the journal/paper/outlet where the article was published.
Be sure to answer in the form "published_date;title;journal" as the answer will be used in a CSV.
If you can't find the information, answer "not_found".
'''
result = llm.generate(prompt)
print_blue(result)
if result == "not_found":
return None
else:
parts = result.content.split(";", 2)
if len(parts) != 3:
return None
published_date, title, journal = parts
if published_date == "not_found":
published_date = "[Unknown date]"
else:
try:
published_year = int(published_date.split("-")[0])
except:
published_year = None
if title == "not_found":
title = "[Unknown title]"
if journal == "not_found":
journal = "[Unknown publication]"
return {
"published_date": published_date,
"published_year": published_year,
"title": title,
"journal": journal,
}
result = llm.generate(prompt, format=ArticleMetadataResponse.model_json_schema())
structured_response = ArticleMetadataResponse.model_validate_json(result.content)
# Extract and process metadata with defaults and safer type conversion
metadata = {
"title": structured_response.title or "[Unknown title]",
"published_date": structured_response.published_date or "[Unknown date]",
"journal": structured_response.journal or "[Unknown publication]",
"published_year": None
}
# Parse year from date if available
if metadata["published_date"] and metadata["published_date"] != "[Unknown date]":
try:
metadata["published_year"] = int(metadata["published_date"].split("-")[0])
except (ValueError, IndexError):
pass
return metadata
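# For illustration, a structured LLM response validated against
# ArticleMetadataResponse might parse from JSON like this (hypothetical values):
#
#   {"published_date": "2021-06-15",
#    "title": "An Example Article Title",
#    "journal": "Climatic Change"}
#
# which the code above turns into:
#
#   {"title": "An Example Article Title", "published_date": "2021-06-15",
#    "journal": "Climatic Change", "published_year": 2021}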
def get_crossref(self, doi):
try:
@ -903,7 +1083,7 @@ class Processor:
assert self.document.pdf_file or self.document.pdf, "PDF file must be provided."
if not self.document.pdf:
self.document.open_pdf(self.document.pdf_file)
if self.document.is_image:
return pymupdf4llm.to_markdown(
self.document.pdf, page_chunks=False, show_progress=False
@ -940,11 +1120,10 @@ class Processor:
if not self.document.metadata and self.document.title:
self.document.metadata = self.get_semantic_scholar_by_title(self.document.title)
# Continue with the rest of the method...
arango_collection = self.get_arango()
# ... rest of the method remains the same ...
if self.document.is_sci:
arango_collection = self.get_arango(document_type='sci_articles')
else:
arango_collection = self.get_arango(document_type='other_documents')
doc = arango_collection.get(self.document._key) if self.document.doi else None
@ -975,6 +1154,7 @@ class Processor:
arango_collection.update(self.document.doc)
return doc["_id"], arango_collection.db_name, self.document.doi
# If no document found, create a new one
else:
self.document.doc = (
{"doi": self.document.doi, "_key": fix_key(self.document.doi)}
@ -1021,7 +1201,8 @@ class Processor:
print_yellow(f"Document key: {_key}")
print(self.document.doi, self.document.title, self.document.get_title())
self.document.doc["_key"] = fix_key(_key)
self.document._key = fix_key(_key)
self.document._key = self.document.doc["_key"]
self.document.metadata = self.document.doc["metadata"]
if not self.document.text:
self.document.extract_text()
@ -1035,8 +1216,16 @@ class Processor:
self.document.make_chunks()
_id, key = self.chunks2arango()
self.chunks2chroma(_id=_id, key=key)
if not self.document.is_sci and not self.document.doi:
self.document.arango_collection = "other_documents"
self.document.arango_db_name = self.username
print_purple("Not a scientific article, using 'other_articles' collection.")
arango_doc = self.chunks2arango()
_id = arango_doc["_id"]
_key = arango_doc["_key"]
self.chunks2chroma(_id=_id, key=_key)
self._id = _id
return _id, arango_collection.db_name, self.document.doi
@ -1224,6 +1413,8 @@ class PDFProcessor(Processor):
return False, None, None, False
if __name__ == "__main__":
doi = "10.1007/s10584-019-02646-9"
print(f"Processing article with DOI: {doi}")

@ -190,3 +190,4 @@ country_emojis = {
"ro": "🇷🇴",
"rs": "🇷🇸",
}

@ -80,6 +80,11 @@ def create_plan(agent, question):
'''
The example above is only illustrative; you can use other steps and tasks that are more relevant to the question.
Again: The research will be done in a restricted context, with only the available sources and tools. Therefore:
- DO NOT include any steps that require access to the internet or external databases.
- DO NOT include any steps that require cross-referencing sources.
- DO NOT include any steps to find new sources or tools.
"""
return query

@ -1,26 +1,223 @@
from fastapi import FastAPI, BackgroundTasks, Request
from fastapi.responses import JSONResponse
from fastapi.responses import JSONResponse, HTMLResponse
import logging
from datetime import datetime
import json
import os
from typing import Dict, Any
from prompts import get_summary_prompt
from _llm import LLM
from _arango import ArangoDB
from models import ArticleChunk
from _chromadb import ChromaDB
app = FastAPI()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Storage for the latest processed document
latest_result: Dict[str, Any] = {}
latest_result_file = os.path.join(os.path.dirname(__file__), "latest_summary_result.json")
# Load any previously saved result on startup
try:
if os.path.exists(latest_result_file):
with open(latest_result_file, 'r') as f:
latest_result = json.load(f)
logger.info(f"Loaded previous result from {latest_result_file}")
except Exception as e:
logger.warning(f"Could not load previous result: {e}")
# Function to save the latest result to disk
def save_latest_result(result: Dict[str, Any]):
global latest_result
latest_result = result
try:
# Save sanitized version (remove internal fields if needed)
result_to_save = {k: v for k, v in result.items() if not k.startswith('_') or k == '_id'}
with open(latest_result_file, 'w') as f:
json.dump(result_to_save, f, indent=2)
logger.info(f"Saved latest result to {latest_result_file}")
except Exception as e:
logger.error(f"Error saving latest result: {e}")
# New endpoint to get the latest summarized document
@app.get("/latest_result")
async def get_latest_result():
"""
Get the latest summarized document result.
Returns the most recently processed document summary and chunk information.
If no document has been processed yet, returns an empty object.
Returns
-------
dict
The latest processed document with summaries
"""
if not latest_result:
return {"message": "No documents have been processed yet"}
return latest_result
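# Example, assuming the server runs locally on the port set in __main__ below:
#
#   curl http://localhost:8100/latest_result
#
# returns either {"message": "No documents have been processed yet"} or the
# most recently saved summary document as JSON.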
@app.get("/view_results")
async def view_results():
"""
View the latest summarization results in a more readable format.
Returns a formatted response with document summary and chunks.
Returns
-------
dict
A formatted representation of the latest summarized document
"""
if not latest_result:
return {"message": "No documents have been processed yet"}
# Extract the key information
formatted_result = {
"document_id": latest_result.get("_id", "Unknown"),
"timestamp": datetime.now().isoformat(),
"summary": latest_result.get("summary", {}).get("text_sum", "No summary available"),
"model": latest_result.get("summary", {}).get("meta", {}).get("model", "Unknown model"),
}
# Format chunks information if available
chunks = latest_result.get("chunks", [])
if chunks:
formatted_chunks = []
for i, chunk in enumerate(chunks):
chunk_data = {
"chunk_number": i + 1,
"summary": chunk.get("summary", "No summary available"),
"tags": chunk.get("tags", [])
}
# Add references for scientific articles if available
if "references" in chunk:
chunk_data["references"] = chunk.get("references", [])
formatted_chunks.append(chunk_data)
formatted_result["chunks"] = formatted_chunks
formatted_result["chunk_count"] = len(chunks)
return formatted_result
@app.get("/html_results", response_class=HTMLResponse)
async def html_results():
"""
View the latest summarization results in a human-readable HTML format.
"""
if not latest_result:
return """
<html>
<head>
<title>No Results Available</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; }
</style>
</head>
<body>
<h1>No Documents Have Been Processed Yet</h1>
<p>Submit a document for summarization first.</p>
</body>
</html>
"""
# Get the document ID and summary
doc_id = latest_result.get("_id", "Unknown")
summary = latest_result.get("summary", {}).get("text_sum", "No summary available")
model = latest_result.get("summary", {}).get("meta", {}).get("model", "Unknown model")
# Format chunks
chunks_html = ""
chunks = latest_result.get("chunks", [])
for i, chunk in enumerate(chunks):
chunk_summary = chunk.get("summary", "No summary available")
tags = chunk.get("tags", [])
tags_html = ", ".join(tags) if tags else "None"
references_html = ""
if "references" in chunk and chunk["references"]:
references_html = "<h4>References:</h4><ul>"
for ref in chunk["references"]:
references_html += f"<li>{ref}</li>"
references_html += "</ul>"
chunks_html += f"""
<div class="chunk">
<h3>Chunk {i+1}</h3>
<div class="chunk-summary">{chunk_summary}</div>
<div class="chunk-tags"><strong>Tags:</strong> {tags_html}</div>
{references_html}
</div>
<hr>
"""
html_content = f"""
<html>
<head>
<title>Document Summary: {doc_id}</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; max-width: 1000px; margin: 0 auto; padding: 20px; }}
h1, h2, h3 {{ color: #333; }}
.summary {{ background-color: #f9f9f9; padding: 15px; border-left: 4px solid #4CAF50; margin-bottom: 20px; }}
.chunk {{ background-color: #f5f5f5; padding: 15px; margin-bottom: 10px; border-radius: 4px; }}
.chunk-tags {{ margin-top: 10px; font-style: italic; }}
.metadata {{ color: #666; font-size: 0.9em; margin-bottom: 20px; }}
hr {{ border: 0; height: 1px; background: #ddd; margin: 20px 0; }}
.refresh-button {{ padding: 10px 15px; background-color: #4CAF50; color: white; border: none; cursor: pointer; border-radius: 4px; }}
.refresh-button:hover {{ background-color: #45a049; }}
</style>
</head>
<body>
<h1>Document Summary</h1>
<div class="metadata">
<strong>Document ID:</strong> {doc_id}<br>
<strong>Model:</strong> {model}<br>
<strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
</div>
<h2>Summary</h2>
<div class="summary">{summary}</div>
<h2>Chunks ({len(chunks)})</h2>
{chunks_html}
<button class="refresh-button" onclick="window.location.reload()">Refresh Results</button>
</body>
</html>
"""
return html_content
@app.post("/summarise_document")
async def summarize_document(request: Request, background_tasks: BackgroundTasks):
try:
data = await request.json()
logger.info(f"Received data: {data}")
# Clean the data
data['text'] = data.get('text', '').strip()
data['arango_db_name'] = data.get('arango_db_name', '').strip()
data['arango_id'] = data.get('arango_id', '').strip()
# Extract arango_id, checking both top-level field and inside arango_doc
arango_doc = data.get('arango_doc', {}) or {}
arango_id = arango_doc.get('_id', '')
arango_db_name = data.get('arango_db_name', '').strip()
if not arango_db_name:
return JSONResponse(
status_code=400,
content={"detail": "Missing required field: arango_db_name"},
)
print(arango_doc)
# Prepare data for processing
data['text'] = arango_doc.get('text', '').strip()
data['chunks'] = arango_doc.get('chunks', [])
data['arango_db_name'] = arango_db_name
data['arango_id'] = arango_id
data["arango_key"] = arango_doc['_key']
data['is_sci'] = data.get('is_sci', False)
background_tasks.add_task(summarise_document_task, data)
@ -29,45 +226,189 @@ async def summarize_document(request: Request, background_tasks: BackgroundTasks
logger.error(f"Error in summarize_document: {e}")
return JSONResponse(
status_code=500,
content={"detail": "An unexpected error occurred."},
content={"detail": f"An unexpected error occurred: {str(e)}"},
)
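# A hedged example of a request body this endpoint accepts (field names taken
# from the parsing above; all values are hypothetical):
#
#   POST /summarise_document
#   {
#       "arango_db_name": "base",
#       "is_sci": true,
#       "arango_doc": {
#           "_id": "sci_articles/10_1007_s10584-019-02646-9",
#           "_key": "10_1007_s10584-019-02646-9",
#           "text": "Full document text ...",
#           "chunks": [{"text": "First chunk ..."}]
#       }
#   }
#
# The endpoint responds immediately; summarisation runs as a background task.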
def summarise_document_task(doc_data: dict):
try:
_id = doc_data.get("arango_id")
text = doc_data.get("text")
# Get document ID and validate it
_id = doc_data.get("arango_id", "")
# Validate document ID - it should be in format "collection/key"
if not _id or '/' not in _id:
logger.error(f"Invalid document ID format: {_id}")
return
text = doc_data.get("text", "")
is_sci = doc_data.get("is_sci", False)
# Get collection name from document ID
collection = _id.split('/')[0]
if _id.split('/')[0] == 'interviews':
# Set appropriate system message based on document type
if collection == 'interviews':
system_message = "You are summarising interview transcripts. It is very important that you keep to what is written and do not add any of your own opinions or interpretations. Always answer in English."
elif is_sci or _id.split('/')[0] == 'sci_articles':
elif is_sci or collection == 'sci_articles':
system_message = "You are summarising scientific articles. It is very important that you keep to what is written and do not add any of your own opinions or interpretations. Always answer in English."
else:
system_message = "You are summarising a document. It is very important that you keep to what is written and do not add any of your own opinions or interpretations. Always answer in English."
# Initialize LLM and generate summary
llm = LLM(system_message=system_message)
#if 'abstract'
prompt = get_summary_prompt(text, is_sci)
summary = llm.generate(query=prompt)
response = llm.generate(query=prompt)
summary = response.content
# Create summary document
summary_doc = {
"text_sum": summary,
"meta": {
"model": llm.model,
"temperature": llm.options["temperature"],
"temperature": llm.options["temperature"] if text else 0,
},
}
arango = ArangoDB(db_name=doc_data.get("arango_db_name"))
# Process chunks if they exist
chunks = doc_data.get("chunks", [])
if chunks:
doc_data["chunks"] = summarise_chunks(chunks, is_sci=is_sci)
# Get database name and validate it
db_name = doc_data.get("arango_db_name")
if not db_name:
logger.error("Missing database name")
return
# Update document in ArangoDB
arango = ArangoDB(db_name=db_name)
arango.db.update_document(
{"summary": summary_doc, "_id": _id},
{"summary": summary_doc, "_id": _id, "chunks": doc_data["chunks"]},
silent=True,
check_rev=False,
)
# Update ChromaDB with the new summary
chroma = ChromaDB()
if db_name == "sci_articles":
chroma.add_document(
collection="sci_articles_article_summaries",
document_id= doc_data["_key"]
text=summary_doc["text_sum"],
metadata={
"model": summary_doc["meta"]["model"],
"date": datetime.now().strftime("%Y-%m-%d"),
"arango_id": _id,
"arango_db_name": db_name,
},
)
# Save the latest result
save_latest_result({"summary": summary_doc, "_id": _id, "chunks": doc_data["chunks"]})
logger.info(f"Successfully processed document {_id}")
except Exception as e:
logger.error(f'_id: _{id}')
# Log error with document ID if available
doc_id = doc_data.get("arango_id", "unknown")
logger.error(f'Error processing document ID: {doc_id}')
logger.error(f"Error in summarise_document_task: {e}")
def summarise_chunks(chunks: list, is_sci=False):
"""
Summarize chunks of text in a document using a language model.
For each chunk in the document that doesn't already have a summary, this function:
1. Generates a summary of the chunk text
2. Creates tags for the chunk
3. If is_sci=True, extracts scientific references from the chunk
Parameters
----------
chunks: list
A list of dictionaries representing chunks of text from a document.
Each chunk should have a "text" field containing the text to summarize.
is_sci : bool, default=False
If True, uses a scientific article summarization prompt and extracts references.
If False, uses a general article summarization prompt.
Returns
-------
list
A list of updated chunks containing summaries, tags, and metadata.
Raises
------
Exception
If there's an error processing a chunk.
Notes
-----
- Chunks that already have a "summary" field are skipped.
- The function uses an LLM instance with a system prompt tailored to the document type.
- The structured response is validated against the ArticleChunk model.
"""
if is_sci:
system_message = """You are a science assistant summarizing scientific articles.
You will get an article chunk by chunk, and you have three tasks for each chunk:
1. Summarize the content of the chunk.
2. Tag the chunk with relevant tags.
3. Extract the scientific references from the chunk.
"""
else:
system_message = """You are a general assistant summarizing articles.
You will get an article chunk by chunk, and you have two tasks for each chunk:
1. Summarize the content of the chunk.
2. Tag the chunk with relevant tags.
"""
system_message += """\nPlease make use of the previous chunks you have already seen to understand the current chunk in context and make the summary stand for itself. But remember, *it is the current chunk you are summarizing*
ONLY use the information in the chunks to make the summary, and do not add any information that is not in the chunks."""
llm = LLM(system_message=system_message)
new_chunks = []
for chunk in chunks:
if "summary" in chunk:
new_chunks.append(chunk)
continue
prompt = f"""Summarize the following text to make it stand on its own:\n
'''
{chunk['text']}
'''\n
Your tasks are:
1. Summarize the content of the chunk. Make sure to include all relevant details!
2. Tag the chunk with relevant tags.
"""
if is_sci:
prompt += "\n3. Extract the scientific references mentioned in this specific chunk. If there is a DOI reference, include that in the reference. Sometimes the reference is only a number in brackets, like [1], so make sure to include that as well (in brackets)."
prompt += "\nONLY use the information in the chunks to make the summary, and do not add any information that is not in the chunks."
try:
response = llm.generate(prompt, format=ArticleChunk.model_json_schema())
structured_response = ArticleChunk.model_validate_json(response.content)
chunk["summary"] = structured_response.summary
chunk["tags"] = [i.lower() for i in structured_response.tags]
# Add references for scientific articles if they exist in the response
if is_sci and hasattr(structured_response, 'references') and structured_response.references:
chunk["references"] = structured_response.references
chunk["summary_meta"] = {
"model": llm.model,
"date": datetime.now().strftime("%Y-%m-%d"),
}
except Exception as e:
logger.error(f"Error processing chunk: {e}")
# Continue processing other chunks even if one fails
chunk["summary"] = "Error processing chunk"
chunk["tags"] = []
new_chunks.append(chunk)
return new_chunks
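# After a successful pass, a chunk might look like this (illustrative values;
# "references" is only added for scientific articles):
#
#   {
#       "text": "...original chunk text...",
#       "summary": "Self-contained summary of the chunk.",
#       "tags": ["climate adaptation", "coastal cities"],
#       "references": ["[1] Smith et al. 2020, doi:10.1000/xyz123"],
#       "summary_meta": {"model": "<llm.model>", "date": "2025-03-01"},
#   }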
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8100)

@ -54,16 +54,30 @@ def make_arango(username):
root_password = os.getenv("ARANGO_ROOT_PASSWORD")
arango = ArangoDB(user=root_user, password=root_password, db_name="_system")
# Create the user
if not arango.db.has_user(username):
user = arango.db.create_user(
username,
password=os.getenv("ARANGO_PASSWORD"),
active=True,
extra={},
)
else:
user = arango.db.user(username)
user['password'] = os.getenv("ARANGO_PASSWORD")
print_rainbow(user)
if not arango.db.has_database(username):
arango.db.create_database(
username,
name=user['username'],
users=[
user,
{
"username": os.getenv("ARANGO_USER"),
"password": os.getenv("ARANGO_PASSWORD"),
"active": True,
"extra": {},
}
},
],
)
arango = ArangoDB(user=root_user, password=root_password, db_name=username)
@ -145,7 +159,6 @@ def main():
yaml_file = "streamlit_users.yaml"
data = read_yaml(yaml_file)
if args.delete:
if args.user:
username = args.user

@ -0,0 +1,334 @@
from pydantic import BaseModel, Field
from typing import Dict, List, Tuple, Optional, Any
class ArticleChunk(BaseModel):
summary: str
tags: List[str]
references: Optional[List[str]] = None
class QueryResponse(BaseModel):
"""
Represents a query generated for retrieving documents from a vector database.
Attributes:
query (str): The generated query text, short and concise.
"""
query: str = Field(
description="The generated query that will be used to retrieve documents from a vector database (ChromaDB). Should be short and concise.",
example="capital of France",
)
class ArticleMetadataResponse(BaseModel):
"""
Represents structured metadata extracted from an article by an LLM.
"""
published_date: Optional[str] = Field(
description="The publication date of the article in YYYY-MM-DD format."
)
title: str = Field(
description="The full title of the article."
)
journal: Optional[str] = Field(
description="The name of the journal/paper/outlet where the article was published."
)
class PlanEvaluationResponse(BaseModel):
"""
Represents the evaluation of a plan's step.
Attributes:
reasoning (str): Explanation of the reasoning behind the evaluation.
complete (bool): Indicates if the step has sufficient information to proceed.
"""
reasoning: str = Field(
description="A short explanation of the reasoning behind the evaluation",
example="Although some information is missing, the existing data is sufficient to complete the step.",
)
complete: bool = Field(
description="Indicates whether the information is sufficient to complete the step",
example=False,
)
class EvaluateFormat(BaseModel):
"""
Represents the evaluation format for determining sufficiency of information.
Attributes:
explanation (str): Explanation of whether the information is sufficient.
status (bool): Indicates sufficiency of the information.
additional_info (Optional[str]): Additional information needed if insufficient.
"""
explanation: str = Field(
description="A very short explanation of whether the information is sufficient or not",
example="The information is sufficient because...",
)
status: bool = Field(
description="If the information is sufficient to complete the step or not.",
example=True,
)
additional_info: Optional[str] = Field(
description="If the information is not sufficient, what additional information would be needed",
example="We need more information about...",
)
class Plan(BaseModel):
"""
Represents a structured plan with steps and corresponding tasks or facts.
Attributes:
steps (Dict[str, List[Tuple[str, str]]]): A dictionary where keys are step names and values are lists of tasks or facts.
"""
steps: Dict[str, List[Tuple[str, str]]] = Field(
description="Structured plan represented as steps with their corresponding tasks or facts",
example={
"Step 1: Gather Existing Materials": [
("Task 1", "Description of task"),
("Task 2", "Description of task"),
],
"Step 2: Extract Relevant Information": [
("Task 1", "Description of task"),
("Task 2", "Description of task"),
],
},
)
class ChunkMetadata(BaseModel):
"""
Metadata associated with a document chunk.
Attributes:
title (str): Title of the document chunk.
journal (Optional[str]): Journal where the document was published.
published_date (Optional[str]): Date of publication.
user_notes (Optional[str]): User-provided notes.
arango_id (Optional[str]): Unique identifier for the document in ArangoDB.
additional_metadata (Dict[str, Any]): Any additional metadata fields.
doi (Optional[str]): Digital Object Identifier for the document.
link (Optional[str]): URL to access the document.
authors (Optional[List[str]]): List of authors of the document.
published_year (Optional[int]): Year of publication.
abstract (Optional[str]): Abstract of the document.
pages (Optional[str]): Page numbers of the document.
chroma_id (Optional[str]): Unique identifier for the chunk in ChromaDB.
"""
title: str = Field(default="No title", description="Title of the document chunk.")
journal: Optional[str] = None
published_date: Optional[str] = None
user_notes: Optional[str] = None
arango_id: Optional[str] = None
additional_metadata: Dict[str, Any] = Field(default_factory=dict)
doi: Optional[str] = None
link: Optional[str] = None
authors: Optional[List[str]] = Field(
default_factory=list,
description="List of authors of the document.",
)
published_year: Optional[int] = Field(
default=None,
description="Year of publication.",
)
abstract: Optional[str] = Field(
default=None,
description="Abstract of the document.",
)
pages: Optional[str] = Field(
default=None,
description="Page numbers of the document.",
)
chroma_id: Optional[str] = Field(
default=None,
description="Unique identifier for the chunk in ChromaDB.",
)
class DocumentChunk(BaseModel):
"""
Represents a chunk of text from a document with its metadata.
Attributes:
document (str): The text content of the chunk.
metadata (ChunkMetadata): Metadata associated with the chunk.
"""
document: str
metadata: ChunkMetadata
class UnifiedDataChunk(BaseModel):
"""
Represents a unified chunk of data from any source.
Attributes:
content (str): The main content of the chunk (e.g., text, note, or document).
metadata (Optional[Dict[str, Any]]): Metadata associated with the chunk.
source_type (str): The type of source (e.g., 'note', 'article', 'document').
"""
content: str = Field(
description="The main content of the chunk (e.g., text, note, or document)."
)
metadata: Optional[ChunkMetadata] = Field(
description="Metadata associated with the chunk (e.g., title, source, date).",
)
source_type: str = Field(
description="The type of source (e.g., 'note', 'article', 'document')."
)
class UnifiedSearchResults(BaseModel):
"""
Represents unified search results from any search tool.
Attributes:
chunks (List[UnifiedDataChunk]): List of data chunks from the search.
source_ids (List[str]): List of unique source IDs for the chunks.
"""
chunks: List[UnifiedDataChunk] = Field(
description="List of data chunks from the search."
)
source_ids: List[str] = Field(
default_factory=list, description="List of unique source IDs for the chunks."
)
class UnifiedToolResponse(BaseModel):
"""
Represents a unified response from any tool.
Attributes:
search_results (Optional[UnifiedSearchResults]): The unified search results, if the tool used returns search results.
text_results (Optional[List[str]]): Text results from the tool, e.g., if the tool is an analysis.
tool_names (Optional[List[str]]): The names of the tools used to generate the response.
"""
search_results: Optional[UnifiedSearchResults] = Field(
default=None,
description="The unified search results, if the tools used is returning search results.",
)
text_results: Optional[list[str]] = Field(
default=None,
description="Text results from the tool, e.g., if the tool is an analysis.",
)
tool_names: Optional[list[str]] = Field(
default=None, description="The name of the tool used to generate the response."
)
def extend_search_results(self, search_results: UnifiedSearchResults) -> None:
"""
Extends the search results with additional data.
Args:
search_results (UnifiedSearchResults): The new search results to extend.
"""
if self.search_results is None:
self.search_results = search_results
else:
self.search_results.chunks.extend(search_results.chunks)
self.search_results.source_ids.extend(search_results.source_ids)
def extend_text_results(self, text_result: str) -> None:
"""
Extends the text result with additional data.
Args:
text_result (str): The new text result to extend.
"""
if self.text_results is None:
self.text_results = [text_result]
else:
self.text_results.append(text_result)
def extend_tool_name(self, tool_name: str) -> None:
"""
Appends a tool name to the list of tool names.
Args:
tool_name (str): The tool name to append.
"""
if self.tool_names is None:
self.tool_names = [tool_name]
else:
self.tool_names.append(tool_name)
@property
def to_text(self) -> str:
"""
Generates formatted text from search results or returns the text result.
If search_results exists, formats content from each chunk along with its source.
Otherwise, returns the text_result if available.
Returns:
str: The formatted text from search results or the text result.
Raises:
ValueError: If neither search_results nor text_results are available.
"""
if self.search_results and self.search_results.chunks:
formatted_chunks = []
for i, chunk in enumerate(self.search_results.chunks):
# Handle UnifiedDataChunk structure
content = chunk.content
metadata = chunk.metadata
source_info = f"Source: {metadata.title}" if metadata else "Source: Unknown"
if metadata and metadata.journal:
source_info += f" - {metadata.journal}"
if metadata and metadata.published_date:
source_info += f" ({metadata.published_date})"
# Format the chunk with its content and source
formatted_chunk = f"### Chunk {i+1}\n{content}\n\n*{source_info}*\n"
formatted_chunks.append(formatted_chunk)
return "\n---\n".join(formatted_chunks)
elif self.text_results:
return '\n---\n'.join(self.text_results)
else:
return "No search results or text results available."
@property
def get_chroma_ids(self) -> List[str]:
"""
Returns the list of Chroma IDs from the search results.
Returns:
List[str]: The list of Chroma IDs.
"""
if self.search_results and self.search_results.source_ids:
return self.search_results.source_ids
return []
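# A small usage sketch (hypothetical data) showing how results from one tool
# run can be accumulated into a single response:
#
#   response = UnifiedToolResponse()
#   response.extend_search_results(
#       UnifiedSearchResults(
#           chunks=[UnifiedDataChunk(content="...", metadata=None, source_type="article")],
#           source_ids=["chroma-id-1"],
#       )
#   )
#   response.extend_tool_name("vector_search")
#   response.to_text         # formatted chunks with source info
#   response.get_chroma_ids  # ["chroma-id-1"]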
class ChunkSearchResults(BaseModel):
"""
Represents the results of a search query across document collections.
Attributes:
chunks (List[UnifiedDataChunk]): List of document chunks containing text and metadata.
chroma_ids (List[str]): List of Chroma IDs for the chunks.
arango_ids (List[str]): List of ArangoDB IDs for the related documents.
"""
chunks: List[UnifiedDataChunk] = Field(
description="List of document chunks containing text, metadata, and relevance scores."
)
chroma_ids: List[str] = Field(
default_factory=list, description="List of Chroma IDs for the chunks"
)
arango_ids: List[str] = Field(
default_factory=list,
description="List of ArangoDB IDs for the related documents",
)

@ -1,6 +0,0 @@
from pydantic import BaseModel
class QueryResponse(BaseModel):
query_to_vector_database: str
short_explanation: str

@ -42,10 +42,8 @@ class ProjectsPage(StreamlitBaseClass):
self.update_session_state(self.page_name)
def load_projects(self):
projects_cursor = self.user_arango.db.aql.execute(
"FOR doc IN projects RETURN doc", count=True
)
self.projects = list(projects_cursor)
# Get projects using the new API method
self.projects = self.user_arango.get_projects(username=self.username)
def display_projects(self):
with st.sidebar:
@ -53,7 +51,7 @@ class ProjectsPage(StreamlitBaseClass):
projects = [proj["name"] for proj in self.projects]
self.selected_project_name = st.selectbox(
"Select a project to manage",
options=[proj["name"] for proj in self.projects],
options=projects,
index=projects.index(self.project) if self.project in projects else None,
)
if self.selected_project_name:
@ -83,16 +81,16 @@ class ProjectsPage(StreamlitBaseClass):
)
if st.button("Create Project"):
if new_project_name:
self.user_arango.db.collection("projects").insert(
{
"name": new_project_name,
"description": new_project_description,
"collections": [],
"notes": [],
"note_keys_hash": hash(""),
"settings": {},
}
)
# Use the API to create a new project
self.user_arango.create_project({
"name": new_project_name,
"description": new_project_description,
"username": self.username,
"collections": [],
"notes": [],
"note_keys_hash": hash(""),
"settings": {},
})
st.success(f'New project "{new_project_name}" created')
st.session_state["new_project"] = False
self.update_settings("current_project", new_project_name)
@ -105,11 +103,11 @@ class ProjectsPage(StreamlitBaseClass):
st.markdown(self.project.notes_summary)
with st.expander("Show project notes"):
notes_cursor = self.user_arango.db.aql.execute(
"FOR doc IN notes FILTER doc._id IN @note_ids RETURN doc",
bind_vars={"note_ids": self.project.notes},
# Use the API to get project notes
notes = self.user_arango.get_project_notes(
project_name=self.project.name,
username=self.username
)
notes = list(notes_cursor)
if notes:
for note in notes:
st.markdown(f'_{note.get("timestamp", "")}_')
@ -126,21 +124,29 @@ class ProjectsPage(StreamlitBaseClass):
def show_project_interviews(self):
with st.expander("Show project interviews"):
if not self.user_arango.db.has_collection("interviews"):
self.user_arango.db.create_collection("interviews")
interviews_cursor = self.user_arango.db.aql.execute(
"FOR doc IN interviews FILTER doc.project == @project_name RETURN doc",
bind_vars={"project_name": self.project.name},
# Use the API to create collection if it doesn't exist
if not self.user_arango.has_collection("interviews"):
self.user_arango.create_collection("interviews")
# Use the API to get interviews for this project
interviews = self.user_arango.execute_aql(
"""
FOR doc IN interviews
FILTER doc.project == @project_name
RETURN doc
""",
bind_vars={"project_name": self.project.name}
)
interviews = list(interviews_cursor)
if interviews:
for interview in interviews:
interviews_list = list(interviews)
if interviews_list:
for interview in interviews_list:
st.markdown(f'_{interview.get("timestamp", "")}_')
if interview['intervievees']:
if interview.get('intervievees'):
st.markdown(
f"**Interviewees:** {', '.join(interview['intervievees'])}"
)
if interview['interviewer']:
if interview.get('interviewer'):
st.markdown(f"**Interviewer:** {interview['interviewer']}")
if len(interview["transcript"].split("\n")) > 6:
preview = (
@ -186,8 +192,10 @@ class ProjectsPage(StreamlitBaseClass):
self.sidebar_actions()
self.project.update_notes_hash()
if st.button(f":red[Remove project *{self.project.name}*]"):
self.user_arango.db.collection("projects").delete_match(
{"name": self.project.name}
# Use the API to delete the project
self.user_arango.delete_project(
project_name=self.project.name,
username=self.username
)
self.update_settings("current_project", None)
st.success(f'Project "{self.project.name}" removed')
@ -196,12 +204,13 @@ class ProjectsPage(StreamlitBaseClass):
self.update_session_state(self.page_name)
def relate_collections(self):
collections = [
col["name"]
for col in self.user_arango.db.collection("article_collections").all()
]
# Get all collections using the API
collections = self.user_arango.execute_aql(
"FOR c IN article_collections RETURN c.name"
)
collections_list = list(collections)
selected_collections = st.multiselect(
"Relate existing collections", options=collections
"Relate existing collections", options=collections_list
)
if st.button("Relate Collections"):
self.project.add_collections(selected_collections)
@ -214,8 +223,10 @@ class ProjectsPage(StreamlitBaseClass):
)
if st.button("Create and Relate Collection"):
if new_collection_name:
self.user_arango.db.collection("article_collections").insert(
{"name": new_collection_name, "articles": []}
# Use the API to insert a new collection
self.user_arango.insert_document(
collection_name="article_collections",
document={"name": new_collection_name, "articles": []}
)
self.project.add_collection(new_collection_name)
st.success(
@ -248,6 +259,7 @@ class ProjectsPage(StreamlitBaseClass):
def upload_notes_form(self):
with st.expander("Upload notes"):
with st.form("add_notes", clear_on_submit=True):
files = st.file_uploader(
"Upload PDF or image",
@ -338,51 +350,6 @@ class Project(StreamlitBaseClass):
A dictionary of settings for the project.
notes_summary : str
A summary of the notes in the project.
Methods:
--------
load_project():
Loads the project data from the ArangoDB.
update_project():
Updates the project data in the ArangoDB.
add_collections(collections):
Adds multiple collections to the project.
add_collection(collection_name):
Adds a single collection to the project.
add_note(note):
Adds a note to the project.
add_interview(interview, intervievees, interviewer, date_of_interveiw):
Adds an interview to the project.
add_interview_transcript(transcript, filename, intervievees, interviewer, date_of_interveiw):
Adds an interview transcript to the project.
transcribe(uploaded_file):
Transcribes an uploaded audio file.
format_transcription(transcription):
Formats the transcription text.
delete_note(note_id):
Deletes a note from the project.
delete_interview(interview_id):
Deletes an interview from the project.
update_notes_hash():
Updates the hash value of the notes.
make_project_notes_hash():
Generates a hash value for the project notes.
create_notes_summary():
Creates a summary of the project notes.
analyze_image(image_base64, text):
Analyzes an image and generates a description.
process_uploaded_notes(files):
Processes uploaded note files.
file2img(file):
Converts an uploaded file to an image.
convert_image_to_pdf(img):
Converts an image to a PDF file.
get_wikipedia_data(page_url):
Fetches data from a Wikipedia page.
process_wikipedia_data(wiki_data, wiki_url):
Processes Wikipedia data and adds it to the project.
process_dois(article_collection_name, text, dois):
Processes DOIs and adds the corresponding articles to the project.
"""
def __init__(self, username: str, project_name: str, user_arango: ArangoDB):
super().__init__(username=username)
@ -394,6 +361,7 @@ class Project(StreamlitBaseClass):
self.note_keys_hash = 0
self.settings = {}
self.notes_summary = ""
self._key = None
# Initialize attributes from arango doc if available
self.load_project()
@ -401,14 +369,15 @@ class Project(StreamlitBaseClass):
def load_project(self):
print_blue("Project name:", self.name)
project_cursor = self.user_arango.db.aql.execute(
"FOR doc IN projects FILTER doc.name == @name RETURN doc",
bind_vars={"name": self.name},
# Use the API to get project details
project = self.user_arango.get_project(
project_name=self.name,
username=self.username
)
project = next(project_cursor, None)
if not project:
raise ValueError(f"Project '{self.name}' not found.")
self._key = project["_key"]
self.name = project.get("name", "")
self.description = project.get("description", "")
@ -418,9 +387,10 @@ class Project(StreamlitBaseClass):
self.settings = project.get("settings", {})
self.notes_summary = project.get("notes_summary", "")
def update_project(self):
# Use the API to update project details
updated_doc = {
"_id": f"projects/{self._key}",
"_key": self._key,
"name": self.name,
"description": self.description,
@ -429,8 +399,9 @@ class Project(StreamlitBaseClass):
"note_keys_hash": self.note_keys_hash,
"settings": self.settings,
"notes_summary": self.notes_summary,
"username": self.username
}
self.user_arango.db.collection("projects").update(updated_doc, check_rev=False)
self.user_arango.update_project(updated_doc)
self.update_session_state()
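# Sketch of the update_project wrapper assumed above (inferred from the call
# it replaces; the actual implementation lives in _arango.py and may differ):
#
#   def update_project(self, updated_doc: dict):
#       self.db.collection("projects").update(updated_doc, check_rev=False)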
def add_collections(self, collections):
@ -448,7 +419,13 @@ class Project(StreamlitBaseClass):
note["text"] = note["text"].strip().strip("\n")
if "timestamp" not in note:
note["timestamp"] = datetime.now().strftime("%Y-%m-%d %H:%M")
note_doc = self.user_arango.db.collection("notes").insert(note)
# Use the API to add a note to the project
note["project"] = self.name
note["username"] = self.username
note_doc = self.user_arango.add_note_to_project(note)
if note_doc["_id"] not in self.notes:
self.notes.append(note_doc["_id"])
self.update_project()
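# Usage sketch (illustrative note content): callers only supply the note dict;
# add_note stamps project, username and timestamp before inserting.
#
#   project.add_note({"text": "Follow up with the reef-monitoring group"})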
@ -534,8 +511,11 @@ class Project(StreamlitBaseClass):
]
if not interviewer:
interviewer = self.username
if not self.user_arango.db.has_collection("interviews"):
self.user_arango.db.create_collection("interviews")
# Ensure interviews collection exists using the API
if not self.user_arango.has_collection("interviews"):
self.user_arango.create_collection("interviews")
if isinstance(date_of_interveiw, str):
date_of_interveiw = datetime.strptime(date_of_interveiw, "%Y-%m-%d")
@ -553,8 +533,10 @@ class Project(StreamlitBaseClass):
document.make_chunks(len_chunks=600)
self.user_arango.db.collection("interviews").insert(
{
# Use the API to insert the interview document
self.user_arango.insert_document(
collection_name="interviews",
document={
"_key": _key,
"transcript": transcript,
"project": self.name,
@ -562,11 +544,10 @@ class Project(StreamlitBaseClass):
"timestamp": timestamp,
"intervievees": intervievees,
"interviewer": interviewer,
"date_of_interveiw": date_of_interveiw,
"date_of_interveiw": date_of_interveiw.isoformat() if date_of_interveiw else None,
"chunks": document.chunks,
},
overwrite=True,
silent=True,
overwrite=True
)
document.make_summary_in_background()
@ -668,11 +649,18 @@ class Project(StreamlitBaseClass):
def delete_note(self, note_id):
if note_id in self.notes:
self.notes.remove(note_id)
# Delete the note document using the API
self.user_arango.delete_document(
collection_name="notes",
document_key=note_id.split("/")[1]
)
self.update_project()
def delete_interview(self, interview_id):
self.user_arango.db.collection("interviews").delete_match(
{"_key": interview_id}
# Delete interview using the API
self.user_arango.delete_document(
collection_name="interviews",
document_key=interview_id
)
def update_notes_hash(self):
@ -690,12 +678,14 @@ class Project(StreamlitBaseClass):
return hash(note_keys_str)
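# Note: Python's built-in hash() is randomized per process for strings
# (PYTHONHASHSEED), so this value is only stable within one run. A digest
# would survive restarts, e.g. (sketch):
#
#   import hashlib
#   return int(hashlib.sha256(note_keys_str.encode()).hexdigest(), 16)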
def create_notes_summary(self):
notes_cursor = self.user_arango.db.aql.execute(
"FOR doc IN notes FILTER doc._id IN @note_ids RETURN doc.text",
bind_vars={"note_ids": self.notes},
)
notes = list(notes_cursor)
notes_string = "\n---\n".join(notes)
# Get note texts using the API
notes_list = []
for note_id in self.notes:
note = self.user_arango.get_document(note_id)
if note and "text" in note:
notes_list.append(note["text"])
notes_string = "\n---\n".join(notes_list)
llm = LLM(model="small")
query = get_note_summary_prompt(self, notes_string)
summary = llm.generate(query).content
@ -799,8 +789,17 @@ class Project(StreamlitBaseClass):
)
wiki_data.pop("summary", None)
wiki_data.pop("content", None)
self.user_arango.db.collection("notes").insert(
wiki_data, overwrite=True, silent=True
# Use the API to insert wiki data as a note
self.user_arango.insert_document(
collection_name="notes",
document={
**wiki_data,
"project": self.name,
"username": self.username,
"timestamp": datetime.now().strftime("%Y-%m-%d %H:%M")
},
overwrite=True
)
self.add_note(wiki_data)
@ -846,3 +845,29 @@ class Project(StreamlitBaseClass):
_id=f"sci_articles/{fix_key(doi)}",
)
self.update_session_state()
def articles2collection(self, collection, db, _id):
# Use the base/admin ArangoDB for general operations like adding to collections
base_arango = ArangoDB(db_name="base")
# Get the collection
collection_doc = base_arango.execute_aql(
"FOR c IN article_collections FILTER c.name == @name RETURN c",
bind_vars={"name": collection}
)
try:
collection_doc = next(collection_doc)
if _id not in collection_doc["articles"]:
collection_doc["articles"].append(_id)
# Update the collection
base_arango.update_document(collection_doc)
except StopIteration:
# Collection doesn't exist, create it
base_arango.insert_document(
collection_name="article_collections",
document={
"name": collection,
"articles": [_id]
}
)
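# Usage sketch (illustrative values): relate an already-stored article to a
# named collection. The collection name and DOI below are hypothetical.
#
#   project.articles2collection(
#       collection="Coral reefs",
#       db="base",
#       _id=f"sci_articles/{fix_key('10.1234/example.doi')}",
#   )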

@ -4,7 +4,13 @@ from colorprinter.print_color import *
from _base_class import StreamlitBaseClass
from projects_page import Project
from agent_research import ResearchReport, MasterAgent, StructureAgent, ToolAgent, ArchiveAgent, process_step
from agent_research import (
ResearchReport,
MasterAgent,
StructureAgent,
ToolAgent,
ArchiveAgent,
)
import os
import json
@ -12,11 +18,11 @@ import json
class ResearchPage(StreamlitBaseClass):
"""
ResearchPage - A Streamlit interface for deep research using AI agents.
This class provides a user interface for conducting in-depth research using
multiple specialized AI agents working together. It allows users to input
research questions, track progress, and view detailed research reports.
Attributes:
username (str): The username of the current user.
project_name (str): Name of the selected project.
@ -24,7 +30,7 @@ class ResearchPage(StreamlitBaseClass):
page_name (str): Name of the current page ("Research").
research_state (dict): Dictionary tracking the current state of research.
report (ResearchReport): Instance for tracking research progress and results.
Methods:
run(): Main method to render the research interface and handle interactions.
sidebar_actions(): Renders sidebar elements for selecting projects and research options.
@ -33,12 +39,13 @@ class ResearchPage(StreamlitBaseClass):
display_report(): Renders a research report in the Streamlit interface.
show_research_progress(): Displays the current research progress.
"""
def __init__(self, username):
super().__init__(username=username)
self.project_name = None
self.project = None
self.page_name = "Research"
# Research state tracking
self.research_state = {
"in_progress": False,
@ -48,29 +55,29 @@ class ResearchPage(StreamlitBaseClass):
"report": None,
"current_step": None,
"steps_completed": 0,
"total_steps": 0
"total_steps": 0,
}
self.report = None
# Initialize attributes from session state if available
if self.page_name in st.session_state:
for k, v in st.session_state[self.page_name].items():
setattr(self, k, v)
# Create reports directory if it doesn't exist
os.makedirs(f"/home/lasse/sci/reports", exist_ok=True)
def run(self):
self.update_current_page("Research")
self.sidebar_actions()
st.title("Deep Research")
if not self.project:
st.warning("Please select a project to start researching.")
return
# Main interface
if self.research_state["in_progress"]:
self.show_research_progress()
@ -80,24 +87,26 @@ class ResearchPage(StreamlitBaseClass):
# Input for new research
st.subheader(f"New Research for Project: {self.project_name}")
with st.form("research_form"):
question = st.text_area("Enter your research question:",
help="Be specific about what you want to research. Complex questions will be broken down into sub-questions.")
question = st.text_area(
"Enter your research question:",
help="Be specific about what you want to research. Complex questions will be broken down into sub-questions.",
)
start_button = st.form_submit_button("Start Research")
if start_button and question:
self.start_new_research(question)
st.rerun()
# Option to view saved reports
with st.expander("View Saved Reports"):
self.view_saved_reports()
def sidebar_actions(self):
with st.sidebar:
with st.form("select_project"):
self.project = self.choose_project("Project for research:")
submitted = st.form_submit_button("Select Project")
if submitted and self.project:
self.project_name = self.project.name
st.success(f"Selected project: {self.project_name}")
@ -107,7 +116,7 @@ class ResearchPage(StreamlitBaseClass):
if st.button("Cancel Research"):
self.research_state["in_progress"] = False
st.rerun()
elif self.research_state["completed"]:
if st.button("Start New Research"):
self.research_state["completed"] = False
@ -120,22 +129,20 @@ class ResearchPage(StreamlitBaseClass):
self.research_state["in_progress"] = True
self.research_state["completed"] = False
self.research_state["started_at"] = datetime.now().isoformat()
# Initialize the research report
self.report = ResearchReport(
question=question,
username=self.username,
project_name=self.project_name
question=question, username=self.username, project_name=self.project_name
)
# Save current state
st.session_state[self.page_name] = {
"project_name": self.project_name,
"project": self.project,
"research_state": self.research_state,
"report": self.report
"report": self.report,
}
# Start a new thread to run the research process
# In a production environment, you might want to use a background job
# For now, we'll run it in the main thread with streamlit spinner
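# A minimal sketch of that background-job alternative (hypothetical helper
# name; the page does not currently do this):
#
#   import threading
#   worker = threading.Thread(
#       target=self._execute_research,  # assumed extraction of the loop below
#       args=(question,),
#       daemon=True,
#   )
#   worker.start()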
@ -143,15 +150,13 @@ class ResearchPage(StreamlitBaseClass):
try:
# Initialize agents
master_agent = MasterAgent(
username=self.username,
project=self.project,
report=self.report,
chat=True
username=self.username,
project=self.project,
report=self.report,
chat=True,
)
structure_agent = StructureAgent(
username=self.username,
model="small",
report=self.report
username=self.username, model="small", report=self.report
)
tool_agent = ToolAgent(
username=self.username,
@ -159,78 +164,78 @@ class ResearchPage(StreamlitBaseClass):
system_message="You are an assistant with tools. Always choose a tool to help with the task.",
report=self.report,
project=self.project,
chat=True
chat=True,
)
archive_agent = ArchiveAgent(
username=self.username,
report=self.report,
project=self.project,
system_message="You are an assistant specialized in reading and summarizing research information.",
chat=True
chat=True,
)
# Track the research state in the master agent
master_agent.research_state["original_question"] = question
# Execute the research workflow
# 1. Create research plan
st.text("Creating research plan...")
research_plan = master_agent.make_plan(question)
self.report.log_plan(research_plan)
# 2. Structure the plan
st.text("Structuring research plan...")
structured_plan = structure_agent.make_structured(research_plan, question)
structured_plan = structure_agent.make_structured(
research_plan, question
)
self.report.log_plan(research_plan, structured_plan.model_dump())
# Update total steps count
self.research_state["total_steps"] = len(structured_plan.steps)
# 3. Execute the plan step by step
execution_results = {}
for step_name, tasks in structured_plan.steps.items():
st.text(f"Processing step: {step_name}")
self.research_state["current_step"] = step_name
self.research_state["steps_completed"] += 1
# Collect all task descriptions in this step
step_tasks = [
{"task_name": task_name, "task_description": task_description}
for task_name, task_description in tasks
]
# Process the entire step
step_result = process_step(
step_name, step_tasks, master_agent, tool_agent, archive_agent
)
step_result = master_agent.process_step(step_name, step_tasks)
execution_results[step_name] = step_result
# 4. Evaluate if more steps are needed
st.text("Evaluating research plan...")
plan_evaluation = master_agent.evaluate_plan(execution_results)
self.report.log_plan_evaluation(plan_evaluation)
# 5. Write the final report
st.text("Writing final report...")
final_report = master_agent.write_report(execution_results)
self.report.log_final_report(final_report)
# 6. Save the reports
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
report_path = f"/home/lasse/sci/reports/research_report_{self.username}_{timestamp}"
# Save JSON report
json_path = f"{report_path}.json"
with open(json_path, "w") as f:
json.dump(self.report.get_full_report(), f, indent=2)
# Save markdown report
markdown_report = self.report.get_markdown_report()
markdown_path = f"{report_path}.md"
with open(markdown_path, "w") as f:
f.write(markdown_report)
# Update research state
self.research_state["in_progress"] = False
self.research_state["completed"] = True
@ -238,21 +243,22 @@ class ResearchPage(StreamlitBaseClass):
"json_path": json_path,
"markdown_path": markdown_path,
"report_data": self.report.get_full_report(),
"markdown_content": markdown_report
"markdown_content": markdown_report,
}
except Exception as e:
st.error(f"An error occurred during research: {str(e)}")
import traceback
st.code(traceback.format_exc())
self.research_state["in_progress"] = False
# Update session state
st.session_state[self.page_name] = {
"project_name": self.project_name,
"project": self.project,
"research_state": self.research_state,
"report": self.report
"report": self.report,
}
def view_saved_reports(self):
@ -261,58 +267,68 @@ class ResearchPage(StreamlitBaseClass):
if not os.path.exists(reports_dir):
st.info("No saved reports found.")
return
# Get all report files
json_files = [f for f in os.listdir(reports_dir) if f.endswith('.json') and f.startswith('research_report')]
json_files = [
f
for f in os.listdir(reports_dir)
if f.endswith(".json") and f.startswith("research_report")
]
if not json_files:
st.info("No saved reports found.")
return
for file in sorted(json_files, reverse=True):
file_path = os.path.join(reports_dir, file)
try:
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
report_data = json.load(f)
# Extract basic info
question = report_data.get("metadata", {}).get("question", "Unknown question")
project = report_data.get("metadata", {}).get("project_name", "No project")
started_at = report_data.get("metadata", {}).get("started_at", "Unknown time")
question = report_data.get("metadata", {}).get(
"question", "Unknown question"
)
project = report_data.get("metadata", {}).get(
"project_name", "No project"
)
started_at = report_data.get("metadata", {}).get(
"started_at", "Unknown time"
)
# Format the date
try:
date_obj = datetime.fromisoformat(started_at)
date_str = date_obj.strftime("%Y-%m-%d %H:%M")
except (ValueError, TypeError):
date_str = started_at
# Create an expandable section for each report
st.markdown(f"_{question} ({project} - {date_str})_")
st.markdown(f"**Project:** {project}")
st.markdown(f"**Date:** {date_str}")
# Button to view full report
if st.button("View Full Report", key=f"view_{file}"):
# Load corresponding markdown file if it exists
md_file = file.replace('.json', '.md')
md_file = file.replace(".json", ".md")
md_path = os.path.join(reports_dir, md_file)
if os.path.exists(md_path):
with open(md_path, 'r') as f:
with open(md_path, "r") as f:
markdown_content = f.read()
else:
markdown_content = None
self.research_state["completed"] = True
self.research_state["report"] = {
"json_path": file_path,
"markdown_path": md_path if os.path.exists(md_path) else None,
"report_data": report_data,
"markdown_content": markdown_content
"markdown_content": markdown_content,
}
st.rerun()
except Exception as e:
st.error(f"Error loading report {file}: {str(e)}")
@ -321,13 +337,13 @@ class ResearchPage(StreamlitBaseClass):
if not report_data:
st.warning("No report data available.")
return
st.title("Research Report")
# Get report data
markdown_content = report_data.get("markdown_content")
json_data = report_data.get("report_data")
if markdown_content:
# Display the markdown report
st.markdown(markdown_content)
@ -335,80 +351,91 @@ class ResearchPage(StreamlitBaseClass):
# Fallback to displaying JSON data in a more readable format
question = json_data.get("metadata", {}).get("question", "Unknown question")
st.header(f"Research on: {question}")
# Display metadata
st.subheader("Metadata")
metadata = json_data.get("metadata", {})
st.markdown(f"**Project:** {metadata.get('project_name', 'None')}")
st.markdown(f"**Started:** {metadata.get('started_at', 'Unknown')}")
st.markdown(f"**Finished:** {metadata.get('finished_at', 'Unknown')}")
# Display final report
st.subheader("Research Findings")
st.markdown(json_data.get("final_report", "No final report available."))
# Display steps
st.subheader("Research Steps")
steps = json_data.get("steps", {})
for step_name, step_data in steps.items():
with st.expander(step_name):
st.markdown(f"**Summary:** {step_data.get('summary', 'No summary available.')}")
st.markdown(
f"**Summary:** {step_data.get('summary', 'No summary available.')}"
)
# Display tools used
st.markdown("**Tools used:**")
for tool in step_data.get("tools_used", []):
st.markdown(f"- {tool.get('tool', 'Unknown tool')} with query: _{tool.get('args', {}).get('query', 'No query')}_")
st.markdown(
f"- {tool.get('tool', 'Unknown tool')} with query: _{tool.get('args', {}).get('query', 'No query')}_"
)
else:
st.error("No report content available to display.")
# Download buttons
col1, col2 = st.columns(2)
with col1:
if report_data.get("markdown_path") and os.path.exists(report_data["markdown_path"]):
if report_data.get("markdown_path") and os.path.exists(
report_data["markdown_path"]
):
with open(report_data["markdown_path"], "r") as f:
markdown_content = f.read()
st.download_button(
label="Download as Markdown",
data=markdown_content,
file_name=os.path.basename(report_data["markdown_path"]),
mime="text/markdown"
mime="text/markdown",
)
with col2:
if report_data.get("json_path") and os.path.exists(report_data["json_path"]):
if report_data.get("json_path") and os.path.exists(
report_data["json_path"]
):
with open(report_data["json_path"], "r") as f:
json_content = f.read()
st.download_button(
label="Download as JSON",
data=json_content,
file_name=os.path.basename(report_data["json_path"]),
mime="application/json"
mime="application/json",
)
def show_research_progress(self):
"""Displays the current research progress"""
st.subheader("Research in Progress")
st.markdown(f"**Question:** {self.research_state['question']}")
# Show progress bar
progress = 0
if self.research_state["total_steps"] > 0:
progress = self.research_state["steps_completed"] / self.research_state["total_steps"]
progress = (
self.research_state["steps_completed"]
/ self.research_state["total_steps"]
)
st.progress(progress)
# Show current step
current_step = self.research_state.get("current_step", "Planning")
st.markdown(f"**Current step:** {current_step}")
# Display research plan and progress in expandable sections
if self.report:
with st.expander("Research Plan", expanded=True):
if self.report.report["plan"]["original_text"]:
st.markdown("### Original Research Plan")
st.markdown(self.report.report["plan"]["original_text"])
if self.report.report["plan"]["structured"]:
st.markdown("### Structured Plan")
structured_plan = self.report.report["plan"]["structured"]
@ -416,7 +443,7 @@ class ResearchPage(StreamlitBaseClass):
st.markdown(f"**{step_name}**")
for task_name, task_description in tasks:
st.markdown(f"- {task_name}: {task_description}")
# Show completed steps
if self.report.report["steps"]:
with st.expander("Completed Steps", expanded=True):
@ -426,25 +453,29 @@ class ResearchPage(StreamlitBaseClass):
st.markdown(f"### {step_name}")
if step_data.get("summary"):
st.markdown(f"**Summary:** {step_data['summary']}")
# Show tools used
if step_data.get("tools_used"):
st.markdown("**Tools used:**")
for tool in step_data["tools_used"]:
st.markdown(f"- {tool.get('tool')} with query: _{tool.get('args', {}).get('query', 'No query')}_")
st.markdown(
f"- {tool.get('tool')} with query: _{tool.get('args', {}).get('query', 'No query')}_"
)
# Show information gathering in the current step
current_step_data = self.report.report["steps"].get(current_step, {})
if current_step_data and not current_step_data.get("finished_at"):
with st.expander("Current Step Progress", expanded=True):
st.markdown(f"### {current_step}")
# Show tools used in current step
if current_step_data.get("tools_used"):
st.markdown("**Tools used so far:**")
for tool in current_step_data["tools_used"]:
st.markdown(f"- {tool.get('tool')} with query: _{tool.get('args', {}).get('query', 'No query')}_")
st.markdown(
f"- {tool.get('tool')} with query: _{tool.get('args', {}).get('query', 'No query')}_"
)
# Show information gathered so far
if current_step_data.get("information_gathered"):
st.markdown("**Information gathered:**")
@ -454,6 +485,10 @@ class ResearchPage(StreamlitBaseClass):
if source not in sources_seen:
st.markdown(f"- {source}")
sources_seen.add(source)
st.info("Research is ongoing. This may take several minutes depending on the complexity of the question.")
st.warning("Please do not navigate away from this page while research is in progress.")
st.info(
"Research is ongoing. This may take several minutes depending on the complexity of the question."
)
st.warning(
"Please do not navigate away from this page while research is in progress."
)

@ -11,14 +11,25 @@ from _arango import ArangoDB
def get_settings():
"""
Function to get the settings from the ArangoDB.
Function to get the settings from the ArangoDB using the new API.
"""
if "username" not in st.session_state:
return {}
# Create ArangoDB instance with user's database
arango = ArangoDB(db_name=st.session_state["username"])
settings = arango.db.collection("settings").get("settings")
# Use the get_settings method from the new API
settings = arango.get_settings()
if settings:
st.session_state["settings"] = settings
else:
st.session_state["settings"] = {'current_collection': None, 'current_page': None}
# Initialize default settings if none exist
default_settings = {'current_collection': None, 'current_page': None}
arango.initialize_settings(default_settings)
st.session_state["settings"] = default_settings
return st.session_state["settings"]
@ -40,19 +51,21 @@ except LoginError as e:
st.error(e)
if st.session_state["authentication_status"]:
# Set username in session state
st.session_state["username"] = st.session_state["username"]
sleep(0.1)
# Retry mechanism for loading settings
for _ in range(3):
try:
get_settings()
break
except ImportError as e:
except Exception as e:
sleep(0.3)
print_red(e)
print("Retrying to import get_settings...")
print_red(f"Error getting settings: {e}")
print("Retrying to get settings...")
# Retry mechanism for importing pages
for _ in range(3):
try:
from streamlit_pages import (
Article_Collections,
@ -63,7 +76,6 @@ if st.session_state["authentication_status"]:
Research,
Search_Papers
)
break
except ImportError as e:
# Write the full error traceback
@ -90,7 +102,6 @@ if st.session_state["authentication_status"]:
research = st.Page(Research)
search_papers = st.Page(Search_Papers)
sleep(0.1)
pg = st.navigation([bot_chat, projects, article_collections, research, search_papers, rss_feeds, settings])
sleep(0.1)
@ -112,13 +123,14 @@ if st.session_state["authentication_status"]:
# session_state = st.session_state.to_dict()
# if 'bot' in session_state:
# del session_state['bot']
# arango.db.collection("error_logs").insert(
# {
# arango.insert_document(
# collection_name="error_logs",
# document={
# "error": traceback_string,
# "_key": timestamp,
# "session_state": session_state,
# },
# overwrite=True,
# overwrite=True
# )
# with st.status(":red[An error occurred. The site will be reloaded.]"):
# for i in range(5):

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

@ -1,31 +0,0 @@
from TTS.api import TTS
import torch
from datetime import datetime
tts = TTS("tts_models/en/multi-dataset/tortoise-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tts.to(device)
text="There is, therefore, an increasing need to understand BEVs from a systems perspective. This involves an in-depth consideration of the environmental impact of the product using life cycle assessment (LCA) as well as taking a broader 'circular economy' approach. On the one hand, LCA is a means of assessing the environmental impact associated with all stages of a product's life from cradle to grave: from raw material extraction and processing to the product's manufacture to its use in everyday life and finally to its end of life."
# cloning `lj` voice from `TTS/tts/utils/assets/tortoise/voices/lj`
# with custom inference settings overriding defaults.
time_now = datetime.now().strftime("%Y%m%d%H%M%S")
output_path = f"output/tortoise_{time_now}.wav"
tts.tts_to_file(text,
file_path=output_path,
voice_dir="voices",
speaker="test",
split_sentences=False, # Change to True if context is not enough
num_autoregressive_samples=20,
diffusion_iterations=50)
# # Using presets with the same voice
# tts.tts_to_file(text,
# file_path="output.wav",
# voice_dir="path/to/tortoise/voices/dir/",
# speaker="lj",
# preset="ultra_fast")
# # Random voice generation
# tts.tts_to_file(text,
# file_path="output.wav")

@ -0,0 +1,209 @@
#!/usr/bin/env python3
"""
Test LLM Server and View Results
This script sends a test document to the LLM server for summarization,
waits for processing to complete, and displays the results.
Usage:
python test_and_view.py [--wait SECONDS] [--retries COUNT]
Options:
--wait SECONDS Number of seconds to wait between polling attempts (default: 5)
--retries COUNT Maximum number of polling attempts (default: 20)
"""
import requests
import json
import time
import os
import argparse
import sys
from _arango import ArangoDB
def send_test_document():
"""Send a test document to the LLM server for summarization."""
print("Sending test document to LLM server...")
# Define server endpoint
url = "http://localhost:8100/summarise_document"
# Create a sample document with unique ID based on timestamp
doc_id = f"test_articles/climate_impact_{int(time.time())}"
sample_document = {
"arango_doc": {
"text": """
The Impact of Climate Change on Coral Reefs
Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable.
Rising sea temperatures have led to increased coral bleaching events, where corals expel their symbiotic algae,
leading to whitening and potential death. Studies show that even a 1-2°C increase in water temperature
can trigger mass bleaching events. Additionally, ocean acidification caused by increased CO2 absorption
makes it difficult for corals to build their calcium carbonate skeletons.
Recent research by Johnson et al. (2023) suggests that if current trends continue, we may lose up to 90%
of coral reefs by 2050. However, some corals have shown remarkable resilience. Certain species can adapt
to higher temperatures through a process called adaptive bleaching, where they exchange their algal symbionts
for more heat-tolerant varieties. Conservation efforts focused on cultivating these resilient species may
provide hope for reef preservation.
""",
"chunks": []
},
"arango_db_name": "test_db",
"arango_id": doc_id,
"is_sci": True
}
try:
# Send request to server
response = requests.post(url, json=sample_document)
if response.status_code == 200:
print("✓ Request accepted by server")
print(f"Document ID: {doc_id}")
return {
"db_name": "test_db",
"doc_id": doc_id
}
else:
print(f"✗ Error: {response.status_code}")
print(response.text)
return None
except Exception as e:
print(f"✗ Connection error: {e}")
return None
def poll_for_results(doc_info, max_retries=20, wait_time=5):
"""Poll the database until the document is summarized."""
if not doc_info:
return None
db_name = doc_info["db_name"]
doc_id = doc_info["doc_id"]
print(f"\nPolling for results in {db_name}/{doc_id}...")
print(f"Will check every {wait_time} seconds, up to {max_retries} times.")
arango = ArangoDB(db_name=db_name)
for attempt in range(max_retries):
print(f"Attempt {attempt+1}/{max_retries}... ", end="", flush=True)
try:
# Get the document from ArangoDB
document = arango.get_document(doc_id)
# Check if the document has been summarized
if document and "summary" in document:
print("✓ Document summary found!")
return document
print("Document exists but no summary yet")
time.sleep(wait_time)
except Exception as e:
print(f"Error: {e}")
time.sleep(wait_time)
print("\n✗ Summarization not completed after maximum retries.")
return None
def display_results(document):
"""Display the summarization results."""
if not document:
print("\nNo results to display")
return
print("\n" + "=" * 80)
print(f"RESULTS FOR DOCUMENT: {document.get('_id', 'Unknown')}")
print("=" * 80)
# Document summary
print("\n📄 DOCUMENT SUMMARY")
print("-" * 80)
print(document["summary"]["text_sum"])
# Model info if available
if "meta" in document["summary"]:
meta = document["summary"]["meta"]
model = meta.get("model", "Unknown")
temp = meta.get("temperature", "Unknown")
print(f"\nGenerated using: {model} (temperature: {temp})")
# Check for summarized chunks
if "chunks" in document and document["chunks"]:
summarized_chunks = [chunk for chunk in document["chunks"] if "summary" in chunk]
print(f"\n🧩 CHUNK SUMMARIES ({len(summarized_chunks)}/{len(document['chunks'])} chunks processed)")
for i, chunk in enumerate(summarized_chunks):
print("\n" + "-" * 80)
print(f"Chunk {i+1}:")
print("-" * 80)
print(chunk["summary"])
# Display tags
if "tags" in chunk and chunk["tags"]:
print("\nTags:", ", ".join(chunk["tags"]))
# Display references
if "references" in chunk and chunk["references"]:
print("\nReferences:")
for ref in chunk["references"]:
print(f"- {ref}")
print("\n" + "=" * 80)
# Provide links to web views
print("\nView in browser:")
print("- HTML view: http://localhost:8100/html_results")
print("- JSON view: http://localhost:8100/view_results")
def check_server_status():
"""Check if the LLM server is running."""
try:
requests.get("http://localhost:8100/latest_result", timeout=2)
return True
except requests.exceptions.RequestException:
return False
def main():
parser = argparse.ArgumentParser(description='Test LLM server and view results')
parser.add_argument('--wait', type=int, default=5, help='Seconds to wait between polling attempts')
parser.add_argument('--retries', type=int, default=20, help='Maximum number of polling attempts')
args = parser.parse_args()
print("LLM Server Test and View")
print("======================\n")
# Check if server is running
if not check_server_status():
print("ERROR: Cannot connect to LLM server at http://localhost:8100")
print("Make sure the server is running before continuing.")
sys.exit(1)
print("✓ Server is running\n")
# Send test document
doc_info = send_test_document()
if not doc_info:
print("Failed to send test document")
sys.exit(1)
print("\n⏳ Processing document...")
print("(This may take some time depending on model size and document complexity)")
# Poll for results
result = poll_for_results(doc_info, max_retries=args.retries, wait_time=args.wait)
# Display results
display_results(result)
if __name__ == "__main__":
main()

@ -1,51 +0,0 @@
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
from fairseq import utils
import nltk
import torch
# Download the required NLTK resource
nltk.download('averaged_perceptron_tagger')
# Model loading
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
"facebook/fastspeech2-en-ljspeech",
arg_overrides={"vocoder": "hifigan", "fp16": False}
)
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move all models to the correct device
for model in models:
model.to(device)
# Update configuration and build generator after moving models
TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
generator = task.build_generator(models, cfg)
# Ensure the vocoder is on the correct device
generator.vocoder.model.to(device)
# Define your text
text = """Hi there, thanks for having me! My interest in electric cars really started back when I was a teenager..."""
# Convert text to model input
sample = TTSHubInterface.get_model_input(task, text)
# Recursively move all tensors in sample to the correct device
sample = utils.move_to_cuda(sample) if torch.cuda.is_available() else sample
# Generate speech
wav, rate = TTSHubInterface.get_prediction(task, models[0], generator, sample)
from scipy.io.wavfile import write
# If wav is a tensor, convert it to a NumPy array
if isinstance(wav, torch.Tensor):
wav = wav.cpu().numpy()
# Save the audio to a WAV file
write('output_fair.wav', rate, wav)

@ -1,91 +0,0 @@
import asyncio
import re
from pdf_highlighter import Highlighter
from _chromadb import ChromaDB
from _llm import LLM
import ollama
from colorprinter.print_color import *
from concurrent.futures import ThreadPoolExecutor
# Wrap the synchronous generate method
async def async_generate(llm, prompt):
loop = asyncio.get_event_loop()
with ThreadPoolExecutor() as pool:
return await loop.run_in_executor(pool, llm.generate, prompt)
# Define the main asynchronous function to highlight the PDFs
async def highlight_pdf(data):
# Use the highlight method to highlight the relevant sentences in the PDFs
highlighted_pdf_buffer = await highlighter.highlight(
data=data, zero_indexed_pages=True # Pages are zero-based (e.g., 0, 1, 2, ...)
)
# Save the highlighted PDF to a new file
with open("highlighted_combined_documents.pdf", "wb") as f:
f.write(highlighted_pdf_buffer.getbuffer())
print_green("PDF highlighting completed successfully!")
# Initialize ChromaDB client
chromadb = ChromaDB()
# Define the query to fetch relevant text snippets and metadata from ChromaDB
query = "How are climate researchers advocating for change in the society?"
# Perform the query on ChromaDB
result = chromadb.query(query, collection="sci_articles", n_results=5)
# Use zip to combine the lists into a list of dictionaries
results = [
{"id": id_, "metadata": metadata, "document": document, "distance": distance}
for id_, metadata, document, distance in zip(
result["ids"][0],
result["metadatas"][0],
result["documents"][0],
result["distances"][0],
)
]
for r in results:
print_rainbow(r["metadata"])
print_yellow(type(r["metadata"]['pages']))
# Ask a LLM a question about the text snippets
llm = LLM(model="small")
documents_string = "\n\n---\n\n".join(result["documents"][0])
answer = llm.generate(
f'''{query} Write your answer from the information below?\n\n"""{documents_string}"""\n\n{query}'''
)
print_green(answer)
# Now you want to highlight relevant information in the PDFs to understand what the LLM is using!
# Each result from ChromaDB contains the PDF filename and the pages where the text is found
data = []
for result in results:
pages = result["metadata"].get("pages")
try:
pages = [int(pages)]
except:
# Use re to extract the page numbers separated by commas
pages = list(map(int, re.findall(r"\d+", pages)))
data.append(
{
"user_input": query,
"pdf_filename": result["metadata"]["_id"],
"pages": pages,
'chunk': result['document']
}
)
# Initialize the Highlighter
highlighter = Highlighter(
llm=llm, # Pass the LLM to the Highlighter
comment=False, # Enable comments to understand the context
use_llm=False
)
# Run the main function using asyncio
asyncio.run(highlight_pdf(data))

@ -0,0 +1,191 @@
import requests
import json
import time
from _arango import ArangoDB # Import ArangoDB client to fetch results
def test_summarize_document():
"""
Test the document summarization functionality of the LLM server by sending a POST request
to the summarise_document endpoint.
This function creates a sample document, sends it to the LLM server, and then polls for results.
"""
print("Testing document summarization...")
# Define server endpoint
url = "http://localhost:8100/summarise_document"
# Create a sample document
sample_document = {
"arango_doc": {
"text": """
The Impact of Climate Change on Coral Reefs
Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable.
Rising sea temperatures have led to increased coral bleaching events, where corals expel their symbiotic algae,
leading to whitening and potential death. Studies show that even a 1-2°C increase in water temperature
can trigger mass bleaching events. Additionally, ocean acidification caused by increased CO2 absorption
makes it difficult for corals to build their calcium carbonate skeletons.
Recent research by Johnson et al. (2023) suggests that if current trends continue, we may lose up to 90%
of coral reefs by 2050. However, some corals have shown remarkable resilience. Certain species can adapt
to higher temperatures through a process called adaptive bleaching, where they exchange their algal symbionts
for more heat-tolerant varieties. Conservation efforts focused on cultivating these resilient species may
provide hope for reef preservation.
""",
"chunks": []
},
"arango_db_name": "test_db",
"arango_id": "articles/test_article",
"is_sci": True
}
# Send request to server
print("Sending document to server for summarization...")
response = requests.post(url, json=sample_document)
if response.status_code == 200:
print("Request accepted. Response:", response.json())
# Save values for checking results later
return {
"db_name": sample_document["arango_db_name"],
"doc_id": sample_document["arango_id"]
}
else:
print(f"Error: {response.status_code}")
print(response.text)
return None
def test_summarize_chunks():
"""
Test the chunk summarization functionality directly by creating a sample document with chunks.
In a real application, you'd typically query the results from the database after processing.
"""
print("\nTesting chunk summarization example...")
# Sample document with chunks
sample_document_with_chunks = {
"arango_doc": {
"text": "",
"chunks": [
{
"text": "Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable. Rising sea temperatures have led to increased coral bleaching events.",
"pages": [1]
},
{
"text": "Studies by Smith et al. [1] show that even a 1-2°C increase in water temperature can trigger mass bleaching events. Additionally, ocean acidification makes it difficult for corals to build their calcium carbonate skeletons.",
"pages": [1, 2]
}
]
},
"arango_db_name": "test_db",
"arango_id": "interviews/test_interview",
"is_sci": False
}
url = "http://localhost:8100/summarise_document"
print("Sending document with chunks for summarization...")
response = requests.post(url, json=sample_document_with_chunks)
if response.status_code == 200:
print("Request accepted. Response:", response.json())
return {
"db_name": sample_document_with_chunks["arango_db_name"],
"doc_id": sample_document_with_chunks["arango_id"]
}
else:
print(f"Error: {response.status_code}")
print(response.text)
return None
def poll_for_results(doc_info, max_retries=10, wait_time=5):
"""
Poll the ArangoDB database to check if the document has been summarized.
Args:
doc_info (dict): Dictionary containing db_name and doc_id
max_retries (int): Maximum number of polling attempts
wait_time (int): Time to wait between polling attempts (seconds)
Returns:
dict or None: The document with summaries if available, None otherwise
"""
if not doc_info:
return None
db_name = doc_info["db_name"]
doc_id = doc_info["doc_id"]
print(f"\nPolling for results in {db_name}/{doc_id}...")
arango = ArangoDB(db_name=db_name)
for attempt in range(max_retries):
print(f"Attempt {attempt+1}/{max_retries}...")
try:
# Get the document from ArangoDB
document = arango.get_document(doc_id)
# Check if the document has been summarized
if document and "summary" in document:
print("✓ Document summary found!")
print("-" * 50)
print("Document Summary:")
print("-" * 50)
print(document["summary"]["text_sum"])
print("-" * 50)
# Check if chunks have been summarized
if "chunks" in document and document["chunks"] and "summary" in document["chunks"][0]:
print("✓ Chunk summaries found!")
print("-" * 50)
print("First Chunk Summary:")
print("-" * 50)
print(document["chunks"][0]["summary"])
print("-" * 50)
if len(document["chunks"]) > 1:
print("Tags:", document["chunks"][0]["tags"])
return document
# If we haven't found summaries yet, wait and try again
time.sleep(wait_time)
except Exception as e:
print(f"Error checking document: {e}")
time.sleep(wait_time)
print("❌ Summarization not completed after maximum retries.")
return None
if __name__ == "__main__":
print("LLM Server Test Script")
print("=====================\n")
# Test if server is running
try:
requests.get("http://localhost:8100")
print("Server is running at http://localhost:8100\n")
except requests.exceptions.ConnectionError:
print("ERROR: Cannot connect to server at http://localhost:8100")
print("Make sure the server is running before continuing.\n")
exit(1)
# Run tests and store document info for polling
doc1_info = test_summarize_document()
time.sleep(2) # Brief pause between tests
doc2_info = test_summarize_chunks()
print("\nWaiting for background tasks to complete...")
print("This may take some time depending on LLM response speed.")
# Poll for results (with longer wait time for the first document which needs to be chunked)
poll_for_results(doc1_info, max_retries=20, wait_time=6)
poll_for_results(doc2_info, max_retries=12, wait_time=5)
print("\nTest script completed.")
print("If you didn't see results, the background tasks might still be processing.")
print("You can run this script again later to check, or query the database directly.")

@ -1,38 +0,0 @@
import os
import base64
from ollama import Client, ChatResponse
import env_manager
from colorprinter.print_color import *
import httpx
env_manager.set_env()
# Encode the credentials
auth = httpx.BasicAuth(
username='lasse', password=os.getenv("LLM_API_PWD_LASSE")
)
client = httpx.Client(auth=auth)
client = Client(
host="http://localhost:11434",
headers={
"X-Chosen-Backend": "backend_ollama" # Add this header to specify the chosen backend
},
auth=auth
)
response = client.chat(
model=os.getenv("LLM_MODEL"),
messages=[
{
"role": "user",
"content": "Why is the sky blue?",
},
],
)
# Print the response headers
# Print the chosen backend from the headers
print("Chosen Backend:", response.headers.get("X-Chosen-Backend"))
# Print the response content
print(response)

@ -1,9 +0,0 @@
from _llm import LLM
llm = LLM()
image = '/home/lasse/sci/test_image.png'
image_bytes = open(image, 'rb').read()
print(type(image_bytes))
response = llm.generate('What is this?', images=[image_bytes])
print(response)

@ -1,206 +0,0 @@
from _llm import LLM
from _arango import ArangoDB
from _chromadb import ChromaDB
from streamlit_chatbot import Bot
from pydantic import BaseModel, Field
from typing import Dict, List, Tuple
from colorprinter.print_color import *
from projects_page import Project
from _base_class import StreamlitBaseClass
from prompts import get_tools_prompt
class ResearchBase(Bot):
def __init__(self, username, **args):
super().__init__(username=username, **args)
self.llm = LLM()
self.arango = ArangoDB()
self.chromadb = ChromaDB()
self.messages = []
def start(self):
self.messages = [{"role": "system", "message": self.llm.system_message}]
if self.llm.model in ["small", "standard", "vision", "reasoning", "tools"]:
self.llm.get_model(self.llm.model)
class ResearchManager(ResearchBase):
def __init__(self, username, project=None):
super().__init__(username=username, project=project)
self.llm.system_message = "You are an assistant helping a journalist writing a report based on extensive research."
self.llm.model = "reasoning"
self.start()
def generate_plan(self, question):
query = f"""
A journalist wants to get a report that answers this question: "{question}"
THIS IS *NOT* A QUESTION YOU CAN ANSWER! Instead, you need to make a plan for how to answer this question.
Include what type of information you need from what available sources.
Available sources are:
- Scientific articles
- Other articles the journalist has gathered, such as blog posts, news articles, etc.
- The journalist's own notes.
- Transcribed interviews (already done, you can't produce new ones).
All of the above sources are available in a database, but you need to specify what you need. Be as precise as possible.
As you don't have access to the sources, include steps to retrieve excerpts from articles and select those that might be interesting.
Also include steps to verify the information.
Make the plan easy to follow and structured.
Remember: You are not answering the question, you are making *a plan* for how to answer the question using the available sources.
"""
query += f"\nTo help you understand the subject, here is a summary of notes the journalist has done: {project.notes_summary}"
query += """Please structure the plan like:
## Step 1:
- Task1: Description of task
- Task2: Description of task
## Step 2:
- Task1: Description of task
- Task2: Description of task
Etc, with as many steps and tasks as needed.
"""
return self.llm.generate(query).content
class ResearchAssistant(ResearchBase):
def __init__(self, username):
super().__init__(username)
self.llm.system_message = "You are a Research Assistant"
self.start()
class HelperBot(ResearchBase):
def __init__(self, username):
super().__init__(username)
self.llm.system_message = "You are helping a researcher to structure a text. You will get a text and make it into structured data. Make sure not to change the meaning of the text and keeps all the details in the subtasks."
self.llm.model = "small"
self.start()
def make_structured_plan(self, text, question=None):
class Plan(BaseModel):
steps: Dict[str, List[Tuple[str, str]]] = Field(
description="Structured plan represented as steps with their corresponding tasks or facts",
example={
"Step 1: Gather Existing Materials": [
("Task 1", "Description of task"),
("Task 2", "Description of task"),
],
"Step 2: Extract Relevant Information": [
("Task 1", "Description of task"),
("Task 2", "Description of task"),
],
},
)
if question:
query = f''' This is a proposed plan for how to write a report on "{question}":\n"""{text}"""\nPlease make the plan into structured data with subtasks. Make sure to keep all the details in the subtasks.'''
else:
query = f''' This is a proposed plan for how to write a report:\n"""{text}"""\nPlease make the plan into structured data with subtasks. Make sure to keep all the details in the subtasks.'''
response = self.llm.generate(query, format=Plan.model_json_schema())
print(response)
structured_response = Plan.model_validate_json(response.content)
print('PLAN')
print_rainbow(structured_response)
print()
return structured_response
class ToolBot(ResearchBase):
def __init__(self, username, tools: list):
super().__init__(username, tools=tools)
self.start()
tools_names = [tool.__name__ for tool in self.tools]
tools_name_string = "\n".join(tools_names)
self.llm = LLM(
temperature=0,
system_message=f"""
You are a helpful assistant with tools. The tools you can choose from are:
{tools_name_string}
Your task is to choose one or more tools to answer a user's query.
DON'T come up with your own tools, only use the ones provided.
""",
chat=False,
model="tools",
)
def propose_tools(self, task):
query = f"""What tool(s) would you use to help with this task:
"{task}"
Answer in a structured way using the tool_calls field!
"""
query = get_tools_prompt(task)
response = self.llm.generate(query)
print_yellow('Model:', self.llm.model)
print_rainbow(response)
return response.tool_calls
if __name__ == "__main__":
base = StreamlitBaseClass(username="lasse")
project = Project(
username="lasse",
project_name="Monarch butterflies",
user_arango=base.get_arango(),
)
rm = ResearchManager(username="lasse", project=project)
tb = ToolBot(
username="lasse",
tools=[
"fetch_science_articles_tool",
"fetch_notes_tool",
"fetch_other_documents_tool",
"fetch_science_articles_and_other_documents_tool",
]
)
# ra = ResearchAssistant(username="lasse")
hb = HelperBot(username="lasse")
question = "Tell me five interesting facts about the Monarch butterfly"
# Generate plan
plan = rm.generate_plan(question)
# -- Example of what a plan can look like --
# plan = """## Step-by-Step Plan for Answering the Question: "Tell Me Five Interesting Facts About the Monarch Butterfly"
# ### Step 1: Gather and Organize Existing Materials
# - **Task 1:** Retrieve all existing materials related to Monarch butterflies from the database using keywords such as "Monarch butterfly migration," "habitat loss," "milkweed," "insecticides," "climate change," "Monarch Butterfly Biosphere Reserve," and "migration patterns."
# - **Task 2:** Categorize these materials into scientific articles, other articles (blogs, news), own notes, and transcribed interviews for easy access.
# ### Step 2: Extract Relevant Excerpts
# - **Task 1:** From the retrieved scientific articles, extract information on migration patterns, genetic studies, and population decline factors.
# - **Task 2:** From blogs and news articles, look for interesting anecdotes or recent findings about conservation efforts and unique behaviors of Monarch butterflies.
# ### Step 3: Identify Potential Interesting Facts
# - **Task 1:** Review the extracted excerpts to identify potential facts such as migration patterns, threats faced by Monarchs, population decline statistics, conservation efforts, and unique behaviors.
# - **Task 2:** Compile a list of five compelling and accurate facts based on the extracted information.
# ### Step 4: Verify Information
# - **Task 1:** Cross-check each fact with multiple sources to ensure accuracy. For example, verify migration details across scientific articles and recent news reports.
# - **Task 2:** Look for consensus among sources regarding population trends and threats to Monarchs.
# ### Step 5: Structure the Report
# - **Task 1:** Organize the five selected facts into a coherent structure, ensuring each fact is clearly explained and engaging.
# - **Task 2:** Incorporate quotes or statistics from sources to add depth and credibility to each fact.
# ### Step 6: Review and Finalize
# - **Task 1:** Proofread the report for clarity, accuracy, and grammar.
# - **Task 2:** Ensure all information is presented in an engaging manner suitable for a journalistic report.
# This plan ensures that the journalist systematically gathers, verifies, and presents five interesting facts about Monarch butterflies, providing a comprehensive and accurate report.
# """
#print_blue(plan)
if "</think>" in plan:
plan = plan.split("</think>")[1]
# Make structured plan
structured_plan = hb.make_structured_plan(plan, question)
for step, tasks in structured_plan.steps.items():
print_blue("\n### Step:", step)
for task in tasks:
print_blue("Task:", task[0])
print_yellow(task[1])
tools = tb.propose_tools(task[1])
print_green("Tools:", tools)
print('\n')

@ -0,0 +1,123 @@
import requests
import json
import time
def test_summarize_document():
"""
Test the document summarization functionality of the LLM server by sending a POST request
to the summarise_document endpoint.
This function creates a sample document and sends it to the LLM server; summaries are written back by a background task.
"""
print("Testing document summarization...")
# Define server endpoint
url = "http://localhost:8100/summarise_document"
# Create a sample document
sample_document = {
"arango_doc": {
"text": """
The Impact of Climate Change on Coral Reefs
Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable.
Rising sea temperatures have led to increased coral bleaching events, where corals expel their symbiotic algae,
leading to whitening and potential death. Studies show that even a 1-2°C increase in water temperature
can trigger mass bleaching events. Additionally, ocean acidification caused by increased CO2 absorption
makes it difficult for corals to build their calcium carbonate skeletons.
Recent research by Johnson et al. (2023) suggests that if current trends continue, we may lose up to 90%
of coral reefs by 2050. However, some corals have shown remarkable resilience. Certain species can adapt
to higher temperatures through a process called adaptive bleaching, where they exchange their algal symbionts
for more heat-tolerant varieties. Conservation efforts focused on cultivating these resilient species may
provide hope for reef preservation.
""",
"chunks": []
},
"arango_db_name": "test_db",
"arango_id": "articles/test_article",
"is_sci": True
}
# Send request to server
print("Sending document to server for summarization...")
response = requests.post(url, json=sample_document)
if response.status_code == 200:
print("Request accepted. Response:", response.json())
# In a real-world scenario, you might poll the database to see when the summary is ready (see the polling sketch after this function)
print("Note: In a real implementation, you would check the database for results.")
print("Since this is just a test, we're showing how the request works.")
return True
else:
print(f"Error: {response.status_code}")
print(response.text)
return False
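# A minimal polling sketch for the follow-up described above (mirrors
# poll_for_results in test_llm_server.py; interval and retry count are arbitrary):
#
#   from _arango import ArangoDB
#   arango = ArangoDB(db_name="test_db")
#   for _ in range(10):
#       doc = arango.get_document("articles/test_article")
#       if doc and "summary" in doc:
#           print(doc["summary"]["text_sum"])
#           break
#       time.sleep(5)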
def test_summarize_chunks():
"""
Test the chunk summarization functionality directly by creating a sample document with chunks.
In a real application, you'd typically query the results from the database after processing.
"""
print("\nTesting chunk summarization example...")
# Sample document with chunks
sample_document_with_chunks = {
"arango_doc": {
"text": "",
"chunks": [
{
"text": "Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable. Rising sea temperatures have led to increased coral bleaching events.",
"pages": [1]
},
{
"text": "Studies by Smith et al. [1] show that even a 1-2°C increase in water temperature can trigger mass bleaching events. Additionally, ocean acidification makes it difficult for corals to build their calcium carbonate skeletons.",
"pages": [1, 2]
}
]
},
"arango_db_name": "test_db",
"arango_id": "interviews/test_interview",
"is_sci": False
}
# In a real implementation, you would:
# 1. Send this document to the server
# 2. Check the database later to see the summarized chunks
url = "http://localhost:8100/summarise_document"
print("Sending document with chunks for summarization...")
response = requests.post(url, json=sample_document_with_chunks)
if response.status_code == 200:
print("Request accepted. Response:", response.json())
return True
else:
print(f"Error: {response.status_code}")
print(response.text)
return False
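# Hedged follow-up sketch: an AQL query that pulls back only the chunks the
# server has already summarized (the collection name "interviews" matches the
# arango_id above; the "summary" field name follows view_latest_results.py and
# is an assumption):
def fetch_summarized_chunks(db):
    """Return summaries of processed chunks, given a python-arango db handle."""
    cursor = db.aql.execute(
        "FOR doc IN interviews FILTER doc._key == @key "
        "RETURN doc.chunks[* FILTER CURRENT.summary != null].summary",
        bind_vars={"key": "test_interview"},
    )
    return list(cursor)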
if __name__ == "__main__":
print("LLM Server Test Script")
print("=====================\n")
# Test if server is running
    try:
        requests.get("http://localhost:8100", timeout=5)
        print("Server is running at http://localhost:8100\n")
    except requests.exceptions.ConnectionError:
        print("ERROR: Cannot connect to server at http://localhost:8100")
        print("Make sure the server is running before continuing.\n")
        raise SystemExit(1)
# Run tests
test_summarize_document()
time.sleep(2) # Brief pause between tests
test_summarize_chunks()
print("\nTest script completed. Check your ArangoDB instance for results.")
print("Note: Document summarization happens in background tasks, so results may not be immediate.")
print("You would typically query the database to see the updated documents with summaries.")

@ -1,45 +0,0 @@
import torch
from TTS.api import TTS
from datetime import datetime
# Get device
from TTS.tts.utils.speakers import SpeakerManager
device = "cuda" if torch.cuda.is_available() else "cpu"
# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
exit()  # NOTE: this early exit means the TTS calls below never execute
text = """Hi there, thanks for having me! My interest in electric cars really started back when I was a teenager. I remember learning about the history of EVs and how they've been around since the late 1800s, even before gasoline cars took over. The fact that these vehicles could run on electricity instead of fossil fuels just fascinated me.
Then, in the 90s, General Motors introduced the EV1 - it was a real game-changer. It showed that electric cars could be practical and enjoyable to drive. And when Tesla came along with their Roadster in 2007, proving that EVs could have a long range, I was hooked.
But what really sealed my interest was learning about the environmental impact of EVs. They produce zero tailpipe emissions, which means they can help reduce air pollution and greenhouse gas emissions. That's something I'm really passionate about.
"""
text_se = """Antalet bilar ger dock bara en del av bilden. För att förstå bilberoendet bör vi framför allt titta på hur mycket bilarna faktiskt används.
Stockholmarnas genomsnittliga körsträcka med bil har minskat sedan millennieskiftet. Den är dock lägre i Göteborg och i Malmö.
I procent har bilanvändningen sedan år 2000 minskat lika mycket i Stockholm och Malmö, 9 procent. I Göteborg är minskningen 13 procent, i riket är minskningen 7 procent."""
# Run TTS
# ❗ Since this model is a multi-lingual voice-cloning model, we must set the target speaker_wav and language
# Text to speech list of amplitude values as output
#wav = tts.tts(text=text, speaker_wav="my/cloning/audio.wav", language="en")
# Text to speech to a file
time_now = datetime.now().strftime("%Y%m%d%H%M%S")
output_path = f"output/tts_{time_now}.wav"
tts.tts_to_file(text=text, speaker_wav='voices/test/test_en.wav', language="en", file_path=output_path)
# api = TTS("tts_models/se/fairseq/vits")
# api.tts_with_vc_to_file(
# text_se,
# speaker_wav="test_audio_se.wav",
# file_path="output_se.wav"
# )

@ -1,22 +0,0 @@
import requests
# Define the server URL
server_url = "http://localhost:5002/api/tts"
# Define the payload
payload = {
"text": "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"speaker": "Ana Florence",
"language": "en",
"split_sentences": True
}
# Send the request to the TTS server
response = requests.post(server_url, json=payload)
# Save the response audio to a file
if response.status_code == 200:
with open("output.wav", "wb") as f:
f.write(response.content)
else:
print(f"Error: {response.status_code}")

@ -1,33 +0,0 @@
from TTS.tts.configs.tortoise_config import TortoiseConfig
from TTS.tts.models.tortoise import Tortoise
import torch
import os
import torchaudio
# Initialize Tortoise model
config = TortoiseConfig()
model = Tortoise.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="tts_models/en/multi-dataset/tortoise-v2", eval=True)
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)
# Define the text and voice directory
text = "There is, therefore, an increasing need to understand BEVs from a systems perspective."
voice_dir = "voices"
speaker = "test"
# Load voice samples
voice_samples = []
for file_name in os.listdir(os.path.join(voice_dir, speaker)):
file_path = os.path.join(voice_dir, speaker, file_name)
waveform, sample_rate = torchaudio.load(file_path)
voice_samples.append(waveform)
# Get conditioning latents
conditioning_latents = model.get_conditioning_latents(voice_samples)
# Save conditioning latents to a file
torch.save(conditioning_latents, "conditioning_latents.pth")

@ -13,4 +13,96 @@ def fix_key(_key: str) -> str:
Returns:
str: The sanitized key with disallowed characters replaced by underscores.
"""
return re.sub(r"[^A-Za-z0-9_\-\.@()+=;$!*\'%:]", "_", _key)
return re.sub(r"[^A-Za-z0-9_\-\.@()+=;$!*\'%:]", "_", _key)
def is_reference_chunk(text: str) -> bool:
"""
Determine if a text chunk consists PREDOMINANTLY of references or end matter.
Conservative approach: only returns True for chunks that are clearly mostly references.
Args:
text (str): Text chunk to analyze
Returns:
bool: True if the chunk appears to be mostly references/end matter
"""
# Split text into lines for analysis
lines = [line.strip() for line in text.split('\n') if line.strip()]
if not lines:
return False
# First, check for unambiguous reference chunks (many DOIs or reference links)
doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
doi_matches = len(re.findall(doi_pattern, text))
refhub_matches = len(re.findall(r'http://refhub\.elsevier\.com/\S+', text))
# If there are many DOIs or refhub links, it's almost certainly primarily references
if doi_matches >= 15 or refhub_matches >= 10:
return True
# Find positions of common end matter section headers
end_matter_patterns = [
r"\*\*Credit author statement\*\*",
r"\*\*Declaration of competing interest\*\*",
r"\*\*Acknowledgment\*\*",
r"\*\*Acknowledgement\*\*",
r"\*\*Appendix\b.*\*\*",
r"\*\*References\*\*",
r"^References[\s]*$"
]
# Try to identify where end matter begins
end_matter_positions = []
for pattern in end_matter_patterns:
matches = list(re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE))
for match in matches:
end_matter_positions.append(match.start())
# If we found end matter sections
if end_matter_positions:
# Find the earliest end matter position
first_end_matter = min(end_matter_positions)
# Calculate ratio of substantive content
substantive_ratio = first_end_matter / len(text)
        # If less than 10% of the chunk precedes the end matter, filter it.
        # This is conservative - only filter if the chunk is almost entirely end matter
        if substantive_ratio < 0.10:
return True
else:
# There's significant substantive content before end matter
return False
    # Citation patterns with year, volume and pages (e.g. "2023;45:123-130").
    # The threshold is chosen to stay conservative, mirroring the DOI/refhub
    # thresholds above.
    citation_patterns = len(re.findall(r'\d{4};\d+:\d+[-–]\d+', text))
    if citation_patterns >= 10:
        return True
# Check for lines starting with citation numbers
lines_starting_with_citation = 0
for line in lines:
if re.match(r'^\s*\[\d+\]', line):
lines_starting_with_citation += 1
# If more than half the lines start with reference numbers, it's a reference list
if lines_starting_with_citation > len(lines) / 2:
return True
# Check for abbreviation list (only if it makes up most of the chunk)
abbreviation_lines = 0
for line in lines:
if re.match(r'^[A-Z0-9]{2,}\s+[A-Z][a-z]+', line):
abbreviation_lines += 1
# If more than 70% of lines are abbreviations, it's an abbreviation list
if abbreviation_lines > len(lines) * 0.7:
return True
# Conservative approach: only filter if it's clearly mostly references
return False
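# Illustrative self-check (an addition for demonstration; the sample strings
# are made up). The bracketed citation lines trip the "more than half the
# lines start with a citation number" rule above.
if __name__ == "__main__":
    refs = "\n".join(
        f"[{i}] Author {i}. A study title. Journal. 2020;1:1-10."
        for i in range(1, 6)
    )
    print(is_reference_chunk(refs))  # expected: True
    print(is_reference_chunk("Coral cover declined sharply after 2016."))  # expected: False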

@ -0,0 +1,111 @@
#!/usr/bin/env python3
"""
View Latest LLM Server Results
This script displays the latest document summaries generated by the LLM server
directly in the terminal, providing a quick way to check results without
having to use a web browser.
Usage:
python view_latest_results.py [--raw] [--json]
Options:
--raw Display the raw result data
--json Format the output as JSON
"""
import json
import os
import sys
import argparse
from datetime import datetime
def load_latest_result():
"""Load the latest result from the JSON file."""
latest_result_file = os.path.join(os.path.dirname(__file__), "latest_summary_result.json")
try:
if os.path.exists(latest_result_file):
with open(latest_result_file, 'r') as f:
return json.load(f)
else:
print(f"No results file found at {latest_result_file}")
return None
except Exception as e:
print(f"Error loading results: {e}")
return None
def display_raw(result):
"""Display the raw result data."""
print(json.dumps(result, indent=2))
def display_formatted(result):
"""Display the result in a nicely formatted way."""
if not result:
print("No results available")
return
print("\n" + "=" * 80)
print(f"DOCUMENT: {result.get('_id', 'Unknown')}")
print("=" * 80)
# Document summary
summary = result.get("summary", {}).get("text_sum", "No summary available")
print("\n📄 DOCUMENT SUMMARY")
print("-" * 80)
print(summary)
# Model info if available
if "summary" in result and "meta" in result["summary"]:
meta = result["summary"]["meta"]
model = meta.get("model", "Unknown")
temp = meta.get("temperature", "Unknown")
print(f"\nGenerated using: {model} (temperature: {temp})")
# Display chunks
chunks = result.get("chunks", [])
if chunks:
summarized_chunks = [chunk for chunk in chunks if "summary" in chunk]
print(f"\n🧩 CHUNK SUMMARIES ({len(summarized_chunks)}/{len(chunks)} chunks processed)")
for i, chunk in enumerate(summarized_chunks):
print("\n" + "-" * 80)
print(f"Chunk {i+1}:")
print("-" * 80)
print(chunk["summary"])
# Display tags
if "tags" in chunk and chunk["tags"]:
print("\nTags:", ", ".join(chunk["tags"]))
# Display references
if "references" in chunk and chunk["references"]:
print("\nReferences:")
for ref in chunk["references"]:
print(f"- {ref}")
print("\n" + "=" * 80)
def main():
parser = argparse.ArgumentParser(description='View latest LLM server results')
parser.add_argument('--raw', action='store_true', help='Display raw result data')
parser.add_argument('--json', action='store_true', help='Format output as JSON')
args = parser.parse_args()
result = load_latest_result()
if not result:
print("No results available")
return
if args.raw or args.json:
display_raw(result)
else:
display_formatted(result)
if __name__ == "__main__":
main()