- Implemented Pydantic models for article processing and summarization.
- Created `test_and_view.py` for testing LLM server document summarization.
- Developed `test_llm_server.py` for unit testing summarization functionality.
- Added `test_server.py` for additional testing of document and chunk summarization.
- Introduced `view_latest_results.py` to display the latest summaries from the LLM server.
- Established a structured plan for handling document chunks and their metadata.
- Enhanced error handling and user feedback in testing scripts.

Branch: main
parent 5ee1a062f1
commit 62b68c3717
35 changed files with 6481 additions and 2567 deletions
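As a rough illustration of the first bullet in the commit message, below is a minimal sketch of what Pydantic models for document chunks and summaries could look like. The class and field names are assumptions for illustration only, not the models actually added in this commit.

from typing import Optional

from pydantic import BaseModel, Field


class ChunkMetadata(BaseModel):
    # Hypothetical fields, mirroring the metadata keys used in the chat code below.
    arango_id: Optional[str] = None
    title: str = "No title"
    journal: Optional[str] = None
    published_date: Optional[str] = None


class DocumentChunk(BaseModel):
    document: str
    metadata: ChunkMetadata
    distance: Optional[float] = None


class ArticleSummary(BaseModel):
    title: str
    summary: str
    chunks: list[DocumentChunk] = Field(default_factory=list)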
@@ -1,800 +0,0 @@
||||
from datetime import datetime |
||||
import streamlit as st |
||||
from _base_class import StreamlitBaseClass, BaseClass |
||||
from _llm import LLM |
||||
from prompts import * |
||||
from colorprinter.print_color import * |
||||
from llm_tools import ToolRegistry |
||||
|
||||
class Chat(StreamlitBaseClass): |
||||
def __init__(self, username=None, **kwargs): |
||||
super().__init__(username=username, **kwargs) |
||||
self.name = kwargs.get("name", None) |
||||
self.chat_history = kwargs.get("chat_history", []) |
||||
|
||||
|
||||
def add_message(self, role, content): |
||||
self.chat_history.append( |
||||
{ |
||||
"role": role, |
||||
"content": content.strip().strip('"'), |
||||
"role_type": self.role, |
||||
} |
||||
) |
||||
|
||||
def to_dict(self): |
||||
return { |
||||
"_key": self._key, |
||||
"name": self.name, |
||||
"chat_history": self.chat_history, |
||||
"role": self.role, |
||||
"username": self.username, |
||||
} |
||||
|
||||
def update_in_arango(self): |
||||
self.last_updated = datetime.now().isoformat() |
||||
self.user_arango.db.collection("chats").insert( |
||||
self.to_dict(), overwrite=True, overwrite_mode="update" |
||||
) |
||||
|
||||
def set_name(self, user_input): |
||||
llm = LLM( |
||||
model="small", |
||||
max_length_answer=50, |
||||
temperature=0.4, |
||||
system_message="You are a chatbot who will be chatting with a user", |
||||
) |
||||
prompt = ( |
||||
f'Give a short name to the chat based on this user input: "{user_input}" ' |
||||
"No more than 30 characters. Answer ONLY with the name of the chat." |
||||
) |
||||
name = llm.generate(prompt).content.strip('"') |
||||
name = f'{name} - {datetime.now().strftime("%B %d")}' |
||||
existing_chat = self.user_arango.db.aql.execute( |
||||
f'FOR doc IN chats FILTER doc.name == "{name}" RETURN doc', count=True |
||||
) |
||||
if existing_chat.count() > 0: |
||||
name = f'{name} ({datetime.now().strftime("%H:%M")})' |
||||
name += f" - [{self.role}]" |
||||
self.name = name |
||||
return name |
||||
|
||||
@classmethod |
||||
def from_dict(cls, data): |
||||
return cls( |
||||
username=data.get("username"), |
||||
name=data.get("name"), |
||||
chat_history=data.get("chat_history", []), |
||||
role=data.get("role", "Research Assistant"), |
||||
_key=data.get("_key"), |
||||
) |
||||
|
||||
def chat_history2bot(self, n_messages: int = None, remove_system: bool = False): |
||||
history = [ |
||||
{"role": m["role"], "content": m["content"]} for m in self.chat_history |
||||
] |
||||
if n_messages and len(history) > n_messages: |
||||
history = history[-n_messages:] |
||||
if history and (
(history[0]["role"] == "system" and remove_system)
or history[0]["role"] == "assistant"
):
history = history[1:]
||||
return history |
||||
|
||||
|
||||
class Bot(BaseClass): |
||||
def __init__(self, username: str, chat: Chat = None, tools: list = None, **kwargs): |
||||
super().__init__(username=username, **kwargs) |
||||
|
||||
# Use the passed in chat or create a new Chat |
||||
self.chat = chat if chat else Chat(username=username, role="Research Assistant") |
||||
print_yellow(f"Chat:", chat, type(chat)) |
||||
# Store or set up project/collection if available |
||||
self.project = kwargs.get("project", None) |
||||
self.collection = kwargs.get("collection", None) |
||||
if self.collection and not isinstance(self.collection, list): |
||||
self.collection = [self.collection] |
||||
|
||||
# Load articles in the collections |
||||
self.arango_ids = [] |
||||
if self.collection: |
||||
for c in self.collection: |
||||
for _id in self.user_arango.db.aql.execute( |
||||
""" |
||||
FOR doc IN article_collections |
||||
FILTER doc.name == @collection |
||||
FOR article IN doc.articles |
||||
RETURN article._id |
||||
""", |
||||
bind_vars={"collection": c}, |
||||
): |
||||
self.arango_ids.append(_id) |
||||
|
||||
# A standard LLM for normal chat |
||||
self.chatbot = LLM(messages=self.chat.chat_history2bot()) |
||||
# A helper bot for generating queries or short prompts |
||||
self.helperbot = LLM( |
||||
temperature=0, |
||||
model="small", |
||||
max_length_answer=500, |
||||
system_message=get_query_builder_system_message(), |
||||
messages=self.chat.chat_history2bot(n_messages=4, remove_system=True), |
||||
) |
||||
# A specialized LLM picking which tool to use |
||||
self.toolbot = LLM( |
||||
temperature=0, |
||||
system_message=""" |
||||
You are an assistant bot helping an answering bot to answer a user's messages. |
||||
Your task is to choose one or multiple tools that will help the answering bot to provide the user with the best possible answer. |
||||
You should NEVER directly answer the user. You MUST choose a tool. |
||||
""", |
||||
chat=False, |
||||
model="small", |
||||
) |
||||
|
||||
# Load or register the passed-in tools |
||||
if tools: |
||||
self.tools = ToolRegistry.get_tools(tools=tools) |
||||
else: |
||||
self.tools = ToolRegistry.get_tools() |
||||
|
||||
# Store other kwargs |
||||
for arg in kwargs: |
||||
setattr(self, arg, kwargs[arg]) |
||||
|
||||
|
||||
|
||||
|
||||
def get_chunks( |
||||
self, |
||||
user_input, |
||||
collections=["sci_articles", "other_documents"], |
||||
n_results=7, |
||||
n_sources=4, |
||||
filter=True, |
||||
): |
||||
# Basic version without Streamlit calls |
||||
query = self.helperbot.generate( |
||||
get_generate_vector_query_prompt(user_input, self.chat.role) |
||||
).content.strip('"') |
||||
|
||||
combined_chunks = [] |
||||
if collections: |
||||
for collection in collections: |
||||
where_filter = {"_id": {"$in": self.arango_ids}} if filter else {} |
||||
chunks = self.get_chromadb().query( |
||||
query=query, |
||||
collection=collection, |
||||
n_results=n_results, |
||||
n_sources=n_sources, |
||||
where=where_filter, |
||||
max_retries=3, |
||||
) |
||||
for doc, meta, dist in zip( |
||||
chunks["documents"][0], |
||||
chunks["metadatas"][0], |
||||
chunks["distances"][0], |
||||
): |
||||
combined_chunks.append( |
||||
{"document": doc, "metadata": meta, "distance": dist} |
||||
) |
||||
combined_chunks.sort(key=lambda x: x["distance"]) |
||||
|
||||
# Keep the best chunks according to n_sources |
||||
sources = set() |
||||
closest_chunks = [] |
||||
for chunk in combined_chunks: |
||||
source_id = chunk["metadata"].get("_id", "no_id") |
||||
if source_id not in sources: |
||||
sources.add(source_id) |
||||
closest_chunks.append(chunk) |
||||
if len(sources) >= n_sources: |
||||
break |
||||
if len(closest_chunks) < n_results: |
||||
remaining_chunks = [ |
||||
c for c in combined_chunks if c not in closest_chunks |
||||
] |
||||
closest_chunks.extend(remaining_chunks[: n_results - len(closest_chunks)]) |
||||
|
||||
# Now fetch real metadata from Arango |
||||
for chunk in closest_chunks: |
||||
_id = chunk["metadata"].get("_id") |
||||
if not _id: |
||||
continue |
||||
if _id.startswith("sci_articles"): |
||||
arango_doc = self.base_arango.db.document(_id) |
||||
else: |
||||
arango_doc = self.user_arango.db.document(_id) |
||||
if arango_doc: |
||||
arango_metadata = arango_doc.get("metadata", {}) |
||||
# Possibly merge notes |
||||
if "user_notes" in arango_doc: |
||||
arango_metadata["user_notes"] = arango_doc["user_notes"] |
||||
chunk["metadata"] = arango_metadata |
||||
|
||||
# Group by article title |
||||
grouped_chunks = {} |
||||
article_number = 1 |
||||
for chunk in closest_chunks: |
||||
title = chunk["metadata"].get("title", "No title") |
||||
chunk["article_number"] = article_number |
||||
if title not in grouped_chunks: |
||||
grouped_chunks[title] = { |
||||
"article_number": article_number, |
||||
"chunks": [], |
||||
} |
||||
article_number += 1 |
||||
grouped_chunks[title]["chunks"].append(chunk) |
||||
return grouped_chunks |
||||
|
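# Hedged sketch (not in the original file): the shape of the dict returned by get_chunks,
# inferred from the grouping code above. The values shown are illustrative only.
# {
#     "Some article title": {
#         "article_number": 1,
#         "chunks": [
#             {"document": "chunk text ...",
#              "metadata": {"title": "Some article title", "journal": "...", "user_notes": "..."},
#              "distance": 0.12,
#              "article_number": 1},
#         ],
#     },
# }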
||||
def answer_tool_call(self, response, user_input): |
||||
bot_responses = [] |
||||
# This method returns / stores responses (no Streamlit calls) |
||||
if not response.get("tool_calls"): |
||||
return "" |
||||
|
||||
for tool in response.get("tool_calls"): |
||||
function_name = tool.function.get('name') |
||||
arguments = tool.function.arguments |
||||
arguments["query"] = user_input |
||||
|
||||
if hasattr(self, function_name): |
||||
if function_name in [ |
||||
"fetch_other_documents_tool", |
||||
"fetch_science_articles_tool", |
||||
"fetch_science_articles_and_other_documents_tool", |
||||
]: |
||||
chunks = getattr(self, function_name)(**arguments) |
||||
bot_responses.append( |
||||
self.generate_from_chunks(user_input, chunks).strip('"') |
||||
) |
||||
elif function_name == "fetch_notes_tool": |
||||
notes = getattr(self, function_name)() |
||||
bot_responses.append( |
||||
self.generate_from_notes(user_input, notes).strip('"') |
||||
) |
||||
elif function_name == "conversational_response_tool": |
||||
bot_responses.append( |
||||
getattr(self, function_name)(user_input).strip('"') |
||||
) |
||||
return "\n\n".join(bot_responses) |
||||
|
||||
def process_user_input(self, user_input, content_attachment=None): |
||||
# Add user message |
||||
self.chat.add_message("user", user_input) |
||||
|
||||
if not content_attachment: |
||||
prompt = get_tools_prompt(user_input) |
||||
response = self.toolbot.generate(prompt, tools=self.tools, stream=False) |
||||
if response.get("tool_calls"): |
||||
bot_response = self.answer_tool_call(response, user_input) |
||||
else: |
||||
# Just respond directly |
||||
bot_response = response.content.strip('"') |
||||
else: |
||||
# If there's an attachment, do something minimal |
||||
bot_response = "Content attachment received (Base Bot)." |
||||
|
||||
# Add assistant message |
||||
if self.chat.chat_history[-1]["role"] != "assistant": |
||||
self.chat.add_message("assistant", bot_response) |
||||
|
||||
# Update in Arango |
||||
self.chat.update_in_arango() |
||||
return bot_response |
||||
|
||||
def generate_from_notes(self, user_input, notes): |
||||
# No Streamlit calls |
||||
notes_string = "" |
||||
for note in notes: |
||||
notes_string += f"\n# {note.get('title','No title')}\n{note.get('content','')}\n---\n" |
||||
prompt = get_chat_prompt(user_input, content_string=notes_string, role=self.chat.role) |
||||
return self.chatbot.generate(prompt, stream=True) |
||||
|
||||
def generate_from_chunks(self, user_input, chunks): |
||||
# No Streamlit calls |
||||
chunks_string = "" |
||||
for title, group in chunks.items(): |
||||
user_notes_string = "" |
||||
if "user_notes" in group["chunks"][0]["metadata"]: |
||||
notes = group["chunks"][0]["metadata"]["user_notes"] |
||||
user_notes_string = f'\n\nUser notes:\n"""\n{notes}\n"""\n\n' |
||||
docs = "\n(...)\n".join([c["document"] for c in group["chunks"]]) |
||||
chunks_string += ( |
||||
f"\n# {title}\n## Article #{group['article_number']}\n{user_notes_string}{docs}\n---\n" |
||||
) |
||||
prompt = get_chat_prompt(user_input, content_string=chunks_string, role=self.chat.role) |
||||
return self.chatbot.generate(prompt, stream=True) |
||||
|
||||
def run(self): |
||||
# Base Bot has no Streamlit run loop |
||||
pass |
||||
|
||||
def get_notes(self): |
||||
# Minimal note retrieval |
||||
notes = self.user_arango.db.aql.execute( |
||||
f'FOR doc IN notes FILTER doc.project == "{self.project.name if self.project else ""}" RETURN doc' |
||||
) |
||||
return list(notes) |
||||
|
||||
@ToolRegistry.register |
||||
def fetch_science_articles_tool(self, query: str, n_documents: int): |
||||
""" |
||||
"Fetches information from scientific articles. Use this tool when the user is looking for information from scientific articles." |
||||
|
||||
Parameters: |
||||
query (str): The search query to find relevant scientific articles. |
||||
n_documents (int): How many documents to fetch. A complex query may require more documents. Min: 3, Max: 10. |
||||
|
||||
Returns: |
||||
list: A list of chunks containing information from the fetched scientific articles. |
||||
""" |
||||
print_purple('Query:', query) |
||||
|
||||
n_documents = int(n_documents) |
||||
if n_documents < 3: |
||||
n_documents = 3 |
||||
elif n_documents > 10: |
||||
n_documents = 10 |
||||
return self.get_chunks( |
||||
query, collections=["sci_articles"], n_results=n_documents |
||||
) |
||||
|
||||
@ToolRegistry.register |
||||
def fetch_other_documents_tool(self, query: str, n_documents: int): |
||||
""" |
||||
Fetches information from other documents based on the user's query. |
||||
|
||||
This method retrieves information from various types of documents such as reports, news articles, and other texts. It should be used only when it is clear that the user is not seeking scientific articles. |
||||
|
||||
Args: |
||||
query (str): The search query provided by the user. |
||||
n_documents (int): How many documents to fetch. A complex query may require more documents. Min: 2, Max: 10. |
||||
|
||||
Returns: |
||||
list: A list of document chunks that match the query. |
||||
""" |
||||
assert isinstance(self, Bot), "The first argument must be a Bot object." |
||||
n_documents = int(n_documents) |
||||
if n_documents < 2: |
||||
n_documents = 2 |
||||
elif n_documents > 10: |
||||
n_documents = 10 |
||||
return self.get_chunks( |
||||
query, |
||||
collections=[f"{self.username}__other_documents"], |
||||
n_results=n_documents, |
||||
) |
||||
|
||||
@ToolRegistry.register |
||||
def fetch_science_articles_and_other_documents_tool( |
||||
self, query: str, n_documents: int |
||||
): |
||||
""" |
||||
Fetches information from both scientific articles and other documents. |
||||
|
||||
This method is often used when the user hasn't specified what kind of sources they are interested in. |
||||
|
||||
Args: |
||||
query (str): The search query to fetch information for. |
||||
n_documents (int): How many documents to fetch. A complex query may require more documents. Min: 3, Max: 10. |
||||
|
||||
Returns: |
||||
list: A list of document chunks that match the search query. |
||||
""" |
||||
assert isinstance(self, Bot), "The first argument must be a Bot object." |
||||
n_documents = int(n_documents) |
||||
if n_documents < 3: |
||||
n_documents = 3 |
||||
elif n_documents > 10: |
||||
n_documents = 10 |
||||
return self.get_chunks( |
||||
query, |
||||
collections=["sci_articles", f"{self.username}__other_documents"], |
||||
n_results=n_documents, |
||||
) |
||||
|
||||
@ToolRegistry.register |
||||
def fetch_notes_tool(bot): |
||||
""" |
||||
Fetches information from the project notes when you as an editor need context from the project notes to understand other information. ONLY use this together with other tools! No arguments needed. |
||||
|
||||
Returns: |
||||
list: A list of notes. |
||||
""" |
||||
assert isinstance(bot, Bot), "The first argument must be a Bot object." |
||||
return bot.get_notes() |
||||
|
||||
@ToolRegistry.register |
||||
def conversational_response_tool(self, query: str): |
||||
""" |
||||
Generate a conversational response to a user's query. |
||||
|
||||
This method is designed to provide a short and conversational response |
||||
without fetching additional data. It should be used only when it is clear |
||||
that the user is engaging in small talk (like saying 'hi') and not seeking detailed information. |
||||
|
||||
Args: |
||||
query (str): The user's message to which the bot should respond. |
||||
|
||||
Returns: |
||||
str: The generated conversational response. |
||||
""" |
||||
query = f""" |
||||
User message: "{query}". |
||||
Make your answer short and conversational. |
||||
This is perhaps not a conversation about a journalistic project, so try not to be too informative. |
||||
Don't answer with anything you're not sure of! |
||||
""" |
||||
|
||||
result = ( |
||||
self.chatbot.generate(query, stream=True) |
||||
if self.chatbot |
||||
else self.llm.generate(query, stream=True) |
||||
) |
||||
return result |
||||
|
||||
class StreamlitBot(Bot): |
||||
def __init__(self, username: str, chat: Chat = None, tools: list = None, **kwargs):
||||
print_purple("StreamlitBot init chat:", chat) |
||||
super().__init__(username=username, chat=chat, tools=tools, **kwargs) |
||||
|
||||
# For Streamlit, we can override or add attributes |
||||
if 'llm_chosen_backend' not in st.session_state: |
||||
st.session_state['llm_chosen_backend'] = None |
||||
|
||||
self.chatbot.chosen_backend = st.session_state['llm_chosen_backend'] |
||||
if not st.session_state['llm_chosen_backend']: |
||||
st.session_state['llm_chosen_backend'] = self.chatbot.chosen_backend |
||||
|
||||
def run(self): |
||||
# Example Streamlit run loop |
||||
self.chat.show_chat_history() |
||||
if user_input := st.chat_input("Write your message here...", accept_file=True): |
||||
text_input = user_input.text.replace('"""', "---") |
||||
if len(user_input.files) > 1: |
||||
st.error("Please upload only one file at a time.") |
||||
return |
||||
attached_file = user_input.files[0] if user_input.files else None |
||||
|
||||
content_attachment = None |
||||
if attached_file: |
||||
if attached_file.type == "application/pdf": |
||||
import fitz |
||||
pdf_document = fitz.open(stream=attached_file.read(), filetype="pdf") |
||||
pdf_text = "" |
||||
for page_num in range(len(pdf_document)): |
||||
page = pdf_document.load_page(page_num) |
||||
pdf_text += page.get_text() |
||||
content_attachment = pdf_text |
||||
elif attached_file.type in ["image/png", "image/jpeg"]: |
||||
self.chat.message_attachments = "image" |
||||
content_attachment = attached_file.read() |
||||
with st.chat_message("user", avatar=self.chat.get_avatar(role="user")): |
||||
st.image(content_attachment) |
||||
|
||||
with st.chat_message("user", avatar=self.chat.get_avatar(role="user")): |
||||
st.write(text_input) |
||||
|
||||
if not self.chat.name: |
||||
self.chat.set_name(text_input) |
||||
self.chat.last_updated = datetime.now().isoformat() |
||||
self.chat.saved = False |
||||
self.user_arango.db.collection("chats").insert( |
||||
self.chat.to_dict(), overwrite=True, overwrite_mode="update" |
||||
) |
||||
|
||||
self.process_user_input(text_input, content_attachment) |
||||
|
||||
def process_user_input(self, user_input, content_attachment=None): |
||||
# We override to show messages in Streamlit instead of just storing |
||||
self.chat.add_message("user", user_input) |
||||
if not content_attachment: |
||||
prompt = get_tools_prompt(user_input) |
||||
response = self.toolbot.generate(prompt, tools=self.tools, stream=False) |
||||
if response.get("tool_calls"): |
||||
bot_response = self.answer_tool_call(response, user_input) |
||||
else: |
||||
bot_response = response.content.strip('"') |
||||
with st.chat_message("assistant", avatar=self.chat.get_avatar(role="assistant")): |
||||
st.write(bot_response) |
||||
else: |
||||
with st.chat_message("assistant", avatar=self.chat.get_avatar(role="assistant")): |
||||
with st.spinner("Reading the content..."): |
||||
if self.chat.message_attachments == "image": |
||||
prompt = get_chat_prompt(user_input, role=self.chat.role, image_attachment=True) |
||||
bot_resp = self.chatbot.generate(prompt, stream=False, images=[content_attachment], model="vision") |
||||
st.write(bot_resp) |
||||
bot_response = bot_resp |
||||
else: |
||||
prompt = get_chat_prompt(user_input, content_attachment=content_attachment, role=self.chat.role) |
||||
response = self.chatbot.generate(prompt, stream=True) |
||||
bot_response = st.write_stream(response) |
||||
|
||||
if self.chat.chat_history[-1]["role"] != "assistant": |
||||
self.chat.add_message("assistant", bot_response) |
||||
|
||||
self.chat.update_in_arango() |
||||
|
||||
def answer_tool_call(self, response, user_input): |
||||
bot_responses = [] |
||||
for tool in response.get("tool_calls", []): |
||||
function_name = tool.function.get('name') |
||||
arguments = tool.function.arguments |
||||
arguments["query"] = user_input |
||||
|
||||
with st.chat_message("assistant", avatar=self.chat.get_avatar(role="assistant")): |
||||
if function_name in [ |
||||
"fetch_other_documents_tool", |
||||
"fetch_science_articles_tool", |
||||
"fetch_science_articles_and_other_documents_tool", |
||||
]: |
||||
chunks = getattr(self, function_name)(**arguments) |
||||
response_text = self.generate_from_chunks(user_input, chunks) |
||||
bot_response = st.write_stream(response_text).strip('"') |
||||
if chunks: |
||||
sources = "###### Sources:\n" |
||||
for title, group in chunks.items(): |
||||
j = group["chunks"][0]["metadata"].get("journal", "No Journal") |
||||
d = group["chunks"][0]["metadata"].get("published_date", "No Date") |
||||
sources += f"[{group['article_number']}] **{title}** :gray[{j} ({d})]\n" |
||||
st.markdown(sources) |
||||
bot_response += f"\n\n{sources}" |
||||
bot_responses.append(bot_response) |
||||
|
||||
elif function_name == "fetch_notes_tool": |
||||
notes = getattr(self, function_name)() |
||||
response_text = self.generate_from_notes(user_input, notes) |
||||
bot_responses.append(st.write_stream(response_text).strip('"')) |
||||
|
||||
elif function_name == "conversational_response_tool": |
||||
response_text = getattr(self, function_name)(user_input) |
||||
bot_responses.append(st.write_stream(response_text).strip('"')) |
||||
|
||||
return "\n\n".join(bot_responses) |
||||
|
||||
def generate_from_notes(self, user_input, notes): |
||||
with st.spinner("Reading project notes..."): |
||||
return super().generate_from_notes(user_input, notes) |
||||
|
||||
def generate_from_chunks(self, user_input, chunks): |
||||
# For reading articles with a spinner |
||||
magazines = set() |
||||
for group in chunks.values(): |
||||
j = group["chunks"][0]["metadata"].get("journal", "No Journal") |
||||
magazines.add(f"*{j}*") |
||||
s = ( |
||||
f"Reading articles from {', '.join(list(magazines)[:-1])} and {list(magazines)[-1]}..." |
||||
if len(magazines) > 1 |
||||
else "Reading articles..." |
||||
) |
||||
with st.spinner(s): |
||||
return super().generate_from_chunks(user_input, chunks) |
||||
|
||||
def sidebar_content(self): |
||||
with st.sidebar: |
||||
st.write("---") |
||||
st.markdown(f'#### {self.chat.name if self.chat.name else ""}') |
||||
st.button("Delete this chat", on_click=self.delete_chat) |
||||
|
||||
def delete_chat(self): |
||||
self.user_arango.db.collection("chats").delete_match( |
||||
filters={"name": self.chat.name} |
||||
) |
||||
self.chat = Chat() |
||||
|
||||
def get_notes(self): |
||||
# We can show a spinner or messages too |
||||
with st.spinner("Fetching notes..."): |
||||
return super().get_notes() |
||||
|
||||
|
||||
class EditorBot(StreamlitBot):
||||
def __init__(self, chat: Chat, username: str, **kwargs): |
||||
print_blue("EditorBot init chat:", chat) |
||||
super().__init__(chat=chat, username=username, **kwargs) |
||||
self.role = "Editor" |
||||
self.tools = ToolRegistry.get_tools() |
||||
self.chatbot = LLM( |
||||
system_message=get_editor_prompt(kwargs.get("project")), |
||||
messages=self.chat.chat_history2bot(), |
||||
chosen_backend=kwargs.get("chosen_backend"), |
||||
) |
||||
|
||||
|
||||
class ResearchAssistantBot(StreamlitBot):
||||
def __init__(self, chat: Chat, username: str, **kwargs): |
||||
super().__init__(chat=chat, username=username, **kwargs) |
||||
self.role = "Research Assistant" |
||||
self.chatbot = LLM( |
||||
system_message=get_assistant_prompt(), |
||||
temperature=0.1, |
||||
messages=self.chat.chat_history2bot(), |
||||
) |
||||
self.tools = [ |
||||
self.fetch_science_articles_tool, |
||||
self.fetch_science_articles_and_other_documents_tool, |
||||
] |
||||
|
||||
|
||||
class PodBot(StreamlitBot):
||||
"""Two LLM agents construct a conversation using material from science articles.""" |
||||
|
||||
def __init__( |
||||
self, |
||||
chat: Chat, |
||||
subject: str, |
||||
username: str, |
||||
instructions: str = None, |
||||
**kwargs, |
||||
): |
||||
super().__init__(chat=chat, username=username, **kwargs) |
||||
self.subject = subject |
||||
self.instructions = instructions |
||||
self.guest_name = kwargs.get("name_guest", "Merit") |
||||
self.hostbot = HostBot( |
||||
Chat(username=self.username, role="Host"), |
||||
subject, |
||||
username, |
||||
instructions=instructions, |
||||
**kwargs, |
||||
) |
||||
self.guestbot = GuestBot( |
||||
Chat(username=self.username, role="Guest"), |
||||
subject, |
||||
username, |
||||
name_guest=self.guest_name, |
||||
**kwargs, |
||||
) |
||||
|
||||
def run(self): |
||||
|
||||
notes = self.get_notes() |
||||
notes_string = "" |
||||
if self.instructions: |
||||
instructions_string = f''' |
||||
These are the instructions for the podcast from the producer: |
||||
""" |
||||
{self.instructions} |
||||
""" |
||||
''' |
||||
else: |
||||
instructions_string = "" |
||||
|
||||
for note in notes: |
||||
notes_string += f"\n# {note['title']}\n{note['content']}\n---\n" |
||||
a = f'''You will conduct a podcast interview with {self.guest_name}, an expert on "{self.subject}".
||||
{instructions_string} |
||||
Below are notes on the subject that you can use to ask relevant questions: |
||||
""" |
||||
{notes_string} |
||||
""" |
||||
Say hello to the expert and start the interview. Remember to keep the interview to the subject of {self.subject} throughout the conversation. |
||||
''' |
||||
|
||||
# Stop button for the podcast |
||||
with st.sidebar: |
||||
stop = st.button("Stop podcast", on_click=self.stop_podcast) |
||||
|
||||
while st.session_state["make_podcast"]: |
||||
# Stop the podcast if there are more than 14 messages in the chat |
||||
self.chat.show_chat_history() |
||||
if len(self.chat.chat_history) == 14: |
||||
result = self.hostbot.generate( |
||||
"The interview has ended. Say thank you to the expert and end the conversation." |
||||
) |
||||
self.chat.add_message("Host", result) |
||||
with st.chat_message( |
||||
"assistant", avatar=self.chat.get_avatar(role="assistant") |
||||
): |
||||
st.write(result.strip('"')) |
||||
st.stop() |
||||
|
||||
_q = self.hostbot.toolbot.generate( |
||||
query=f"{self.guest_name} has answered: {a}. You have to choose a tool to help the host continue the interview.", |
||||
tools=self.hostbot.tools, |
||||
temperature=0.6, |
||||
stream=False, |
||||
) |
||||
if "tool_calls" in _q: |
||||
q = self.hostbot.answer_tool_call(_q, a) |
||||
else: |
||||
q = _q |
||||
|
||||
self.chat.add_message("Host", q) |
||||
|
||||
_a = self.guestbot.toolbot.generate( |
||||
f'The podcast host has asked: "{q}" Choose a tool to help the expert answer with relevant facts and information.', |
||||
tools=self.guestbot.tools, |
||||
) |
||||
if "tool_calls" in _a: |
||||
print_yellow("Tool call response (guest)", _a) |
||||
print_yellow(self.guestbot.chat.role) |
||||
a = self.guestbot.answer_tool_call(_a, q) |
||||
else: |
||||
a = _a |
||||
self.chat.add_message("Guest", a) |
||||
|
||||
self.update_session_state() |
||||
|
||||
def stop_podcast(self): |
||||
st.session_state["make_podcast"] = False |
||||
self.update_session_state() |
||||
self.chat.show_chat_history() |
||||
|
||||
|
||||
class HostBot(StreamlitBot):
||||
def __init__( |
||||
self, chat: Chat, subject: str, username: str, instructions: str, **kwargs |
||||
): |
||||
super().__init__(chat=chat, username=username, **kwargs) |
||||
self.chat.role = kwargs.get("role", "Host") |
||||
self.tools = ToolRegistry.get_tools( |
||||
tools=[ |
||||
self.fetch_notes_tool, |
||||
self.conversational_response_tool, |
||||
# "fetch_other_documents", #TODO Should this be included? |
||||
] |
||||
) |
||||
self.instructions = instructions |
||||
self.llm = LLM( |
||||
system_message=f''' |
||||
You are the host of a podcast and an expert on {subject}. You will ask one question at a time about the subject, and then wait for the guest to answer. |
||||
Don't ask the guest to talk about herself/himself, only about the subject. |
||||
Make your questions short and clear, only if necessary add a brief context to the question. |
||||
These are the instructions for the podcast from the producer: |
||||
""" |
||||
{self.instructions} |
||||
""" |
||||
If the expert's answer is complicated, try to make a very brief summary of it for the audience to understand. You can also ask follow-up questions to clarify the answer, or ask for examples.
||||
''', |
||||
messages=self.chat.chat_history2bot() |
||||
) |
||||
self.toolbot = LLM( |
||||
temperature=0, |
||||
system_message=""" |
||||
You are assisting a podcast host in asking questions to an expert. |
||||
Choose one or many tools to use in order to assist the host in asking relevant questions. |
||||
Often "conversational_response_tool" is enough, but sometimes project notes are needed. |
||||
Make sure to read the description of the tools carefully!""", |
||||
chat=False, |
||||
model="small", |
||||
) |
||||
|
||||
def generate(self, query): |
||||
return self.llm.generate(query) |
||||
|
||||
|
||||
class GuestBot(StreamlitBot):
||||
def __init__(self, chat: Chat, subject: str, username: str, **kwargs): |
||||
super().__init__(chat=chat, username=username, **kwargs) |
||||
self.chat.role = kwargs.get("role", "Guest") |
||||
self.tools = ToolRegistry.get_tools( |
||||
tools=[ |
||||
self.fetch_notes_tool, |
||||
self.fetch_science_articles_tool, |
||||
] |
||||
) |
||||
|
||||
self.llm = LLM( |
||||
system_message=f""" |
||||
You are {kwargs.get('name', 'Merit')}, an expert on {subject}. |
||||
Today you are a guest in a podcast about {subject}. A host will ask you questions about the subject and you will answer by using scientific facts and information. |
||||
When answering, don't say things like "based on the documents" or alike, as neither the host nor the audience can see the documents. Act just as if you were talking to someone in a conversation. |
||||
Try to be concise when answering, and remember that the audience of the podcast is not expert on the subject, so don't complicate things too much. |
||||
It's very important that you answer in a "spoken" way, as if you were talking to someone in a conversation. That means you should avoid using scientific jargon and complex terms, too many figures or abstract concepts. |
||||
Lists are also not recommended; instead use "for the first reason", "secondly", etc.
You can use "..." to indicate a pause and "-" to indicate a break in the sentence, as if you were speaking.
||||
""", |
||||
messages=self.chat.chat_history2bot() |
||||
) |
||||
self.toolbot = LLM( |
||||
temperature=0, |
||||
system_message=f"You are an assistant to an expert on {subject}. Choose one or many tools to use in order to assist the expert in answering questions. Make sure to read the description of the tools carefully.", |
||||
chat=False, |
||||
model="small", |
||||
) |
||||
|
||||
def generate(self, query): |
||||
return self.llm.generate(query) |
||||
@@ -1,574 +0,0 @@
||||
import os |
||||
import base64 |
||||
import re |
||||
from typing import Literal, Optional |
||||
import requests |
||||
import tiktoken |
||||
from ollama import ( |
||||
Client, |
||||
AsyncClient, |
||||
ResponseError, |
||||
ChatResponse, |
||||
Tool, |
||||
Options, |
||||
) |
||||
|
||||
import env_manager |
||||
from colorprinter.print_color import * |
||||
|
||||
env_manager.set_env() |
||||
|
||||
tokenizer = tiktoken.get_encoding("cl100k_base") |
||||
|
||||
|
||||
class LLM: |
||||
""" |
||||
LLM class for interacting with an instance of Ollama. |
||||
|
||||
Attributes: |
||||
model (str): The model to be used for response generation. |
||||
system_message (str): The system message to be used in the chat. |
||||
options (dict): Options for the model, such as temperature. |
||||
messages (list): List of messages in the chat. |
||||
max_length_answer (int): Maximum length of the generated answer. |
||||
chat (bool): Whether the chat mode is enabled. |
||||
chosen_backend (str): The chosen backend server for the API. |
||||
client (Client): The client for synchronous API calls. |
||||
async_client (AsyncClient): The client for asynchronous API calls. |
||||
tools (list): List of tools to be used in generating the response. |
||||
|
||||
Methods: |
||||
__init__(self, system_message, temperature, model, max_length_answer, messages, chat, chosen_backend): |
||||
Initializes the LLM class with the provided parameters. |
||||
|
||||
get_model(self, model_alias): |
||||
Retrieves the model name based on the provided alias. |
||||
|
||||
count_tokens(self): |
||||
Counts the number of tokens in the messages. |
||||
|
||||
get_least_conn_server(self): |
||||
Retrieves the least connected server from the backend. |
||||
|
||||
generate(self, query, user_input, context, stream, tools, images, model, temperature): |
||||
Generates a response based on the provided query and options. |
||||
|
||||
make_summary(self, text): |
||||
Generates a summary of the provided text. |
||||
|
||||
read_stream(self, response): |
||||
Handles streaming responses. |
||||
|
||||
async_generate(self, query, user_input, context, stream, tools, images, model, temperature): |
||||
Asynchronously generates a response based on the provided query and options. |
||||
|
||||
prepare_images(self, images, message): |
||||
""" |
||||
|
||||
def __init__( |
||||
self, |
||||
system_message: str = "You are an assistant.", |
||||
temperature: float = 0.01, |
||||
model: Optional[ |
||||
Literal["small", "standard", "vision", "reasoning", "tools"] |
||||
] = "standard", |
||||
max_length_answer: int = 4096, |
||||
messages: list[dict] = None, |
||||
chat: bool = True, |
||||
chosen_backend: str = None, |
||||
tools: list = None, |
||||
) -> None: |
||||
""" |
||||
Initialize the assistant with the given parameters. |
||||
|
||||
Args: |
||||
system_message (str): The initial system message for the assistant. Defaults to "You are an assistant.". |
||||
temperature (float): The temperature setting for the model, affecting randomness. Defaults to 0.01. |
||||
model (Optional[Literal["small", "standard", "vision", "reasoning"]]): The model type to use. Defaults to "standard". |
||||
max_length_answer (int): The maximum length of the generated answer. Defaults to 4096. |
||||
messages (list[dict], optional): A list of initial messages. Defaults to None. |
||||
chat (bool): Whether the assistant is in chat mode. Defaults to True. |
||||
chosen_backend (str, optional): The backend server to use. If not provided, the least connected server is chosen. |
||||
|
||||
Returns: |
||||
None |
||||
""" |
||||
|
||||
self.model = self.get_model(model) |
||||
self.call_model = self.model  # Updated on each call to record which model was actually used
||||
self.system_message = system_message |
||||
self.options = {"temperature": temperature} |
||||
self.messages = messages or [{"role": "system", "content": self.system_message}] |
||||
self.max_length_answer = max_length_answer |
||||
self.chat = chat |
||||
|
||||
if not chosen_backend: |
||||
chosen_backend = self.get_least_conn_server() |
||||
self.chosen_backend = chosen_backend |
||||
|
||||
|
||||
headers = { |
||||
"Authorization": f"Basic {self.get_credentials()}", |
||||
"X-Chosen-Backend": self.chosen_backend, |
||||
} |
||||
self.host_url = os.getenv("LLM_API_URL").removesuffix("/api/chat/")  # remove the path suffix; rstrip() would strip a character set, not the suffix
self.host_url = 'http://192.168.1.12:3300' #! Change back when possible
||||
self.client: Client = Client(host=self.host_url, headers=headers, timeout=120) |
||||
self.async_client: AsyncClient = AsyncClient() |
||||
|
||||
def get_credentials(self): |
||||
# Initialize the client with the host and default headers |
||||
credentials = f"{os.getenv('LLM_API_USER')}:{os.getenv('LLM_API_PWD_LASSE')}" |
||||
return base64.b64encode(credentials.encode()).decode() |
||||
|
||||
def get_model(self, model_alias): |
||||
|
||||
models = { |
||||
"standard": "LLM_MODEL", |
||||
"small": "LLM_MODEL_SMALL", |
||||
"vision": "LLM_MODEL_VISION", |
||||
"standard_64k": "LLM_MODEL_LARGE", |
||||
"reasoning": "LLM_MODEL_REASONING", |
||||
"tools": "LLM_MODEL_TOOLS", |
||||
} |
||||
model = os.getenv(models.get(model_alias, "LLM_MODEL")) |
||||
self.model = model |
||||
return model |
||||
|
||||
def count_tokens(self): |
||||
num_tokens = 0 |
||||
for i in self.messages: |
||||
for k, v in i.items(): |
||||
if k == "content": |
||||
if not isinstance(v, str): |
||||
v = str(v) |
||||
tokens = tokenizer.encode(v) |
||||
num_tokens += len(tokens) |
||||
return int(num_tokens) |
||||
|
||||
def get_least_conn_server(self): |
||||
try: |
||||
response = requests.get("http://192.168.1.12:5000/least_conn") |
||||
response.raise_for_status() |
||||
# Extract the least connected server from the response |
||||
least_conn_server = response.headers.get("X-Upstream-Address") |
||||
return least_conn_server |
||||
except requests.RequestException as e: |
||||
print_red("Error getting least connected server:", e) |
||||
return None |
||||
|
||||
def generate( |
||||
self, |
||||
query: str = None, |
||||
user_input: str = None, |
||||
context: str = None, |
||||
stream: bool = False, |
||||
tools: list = None, |
||||
images: list = None, |
||||
model: Optional[ |
||||
Literal["small", "standard", "vision", "reasoning", "tools"] |
||||
] = None, |
||||
temperature: float = None, |
||||
messages: list[dict] = None, |
||||
format = None, |
||||
think = False |
||||
): |
||||
""" |
||||
Generate a response based on the provided query and context. |
||||
Parameters: |
||||
query (str): The query string from the user. |
||||
user_input (str): Additional user input to be appended to the last message. |
||||
context (str): Contextual information to be used in generating the response. |
||||
stream (bool): Whether to stream the response. |
||||
tools (list): List of tools to be used in generating the response. |
||||
images (list): List of images to be included in the response. |
||||
model (Optional[Literal["small", "standard", "vision", "tools"]]): The model type to be used. |
||||
temperature (float): The temperature setting for the model. |
||||
messages (list[dict]): List of previous messages in the conversation. |
||||
format (Optional[BaseModel]): The format of the response. |
||||
think (bool): Whether to use the reasoning model. |
||||
|
||||
Returns: |
||||
str: The generated response or an error message if an exception occurs. |
||||
""" |
||||
print_yellow(stream) |
||||
print_yellow("GENERATE") |
||||
# Prepare the model and temperature |
||||
|
||||
model = self.get_model(model) if model else self.model |
||||
# if model == self.get_model('tools'): |
||||
# stream = False |
||||
temperature = temperature if temperature else self.options["temperature"] |
||||
|
||||
if messages: |
||||
messages = [ |
||||
{"role": i["role"], "content": re.sub(r"\s*\n\s*", "\n", i["content"])} |
||||
for i in messages |
||||
] |
||||
message = messages.pop(-1) |
||||
query = message["content"] |
||||
self.messages = messages |
||||
else: |
||||
# Normalize whitespace and add the query to the messages |
||||
query = re.sub(r"\s*\n\s*", "\n", query) |
||||
message = {"role": "user", "content": query} |
||||
|
||||
# Handle images if any |
||||
if images: |
||||
message = self.prepare_images(images, message) |
||||
model = self.get_model("vision") |
||||
|
||||
self.messages.append(message) |
||||
|
||||
# Prepare headers |
||||
headers = {"Authorization": f"Basic {self.get_credentials()}"} |
||||
if self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]: #TODO Maybe reasoning shouldn't be here. |
||||
headers["X-Chosen-Backend"] = self.chosen_backend |
||||
|
||||
if model == self.get_model("small"): |
||||
headers["X-Model-Type"] = "small" |
||||
if model == self.get_model("tools"): |
||||
headers["X-Model-Type"] = "tools" |
||||
|
||||
reasoning_models = ['qwen3', 'deepseek'] #TODO Add more reasoning models here when added to ollama |
||||
if any([model_name in model for model_name in reasoning_models]): |
||||
if think: |
||||
query = f"/think\n{query}" |
||||
else: |
||||
query = f"/no_think\n{query}" |
||||
|
||||
# Prepare options |
||||
options = Options(**self.options) |
||||
options.temperature = temperature |
||||
|
||||
print_yellow("Stream the answer?", stream) |
||||
|
||||
# Call the client.chat method |
||||
try: |
||||
self.call_model = model |
||||
self.client: Client = Client(host=self.host_url, headers=headers, timeout=300) #! |
||||
#print_rainbow(self.client._client.__dict__) |
||||
print_yellow("Model used in call:", model) |
||||
# if headers: |
||||
# self.client.headers.update(headers) |
||||
|
||||
response = self.client.chat( |
||||
model=model, |
||||
messages=self.messages, |
||||
tools=tools, |
||||
stream=stream, |
||||
options=options, |
||||
keep_alive=3600 * 24 * 7, |
||||
format=format |
||||
) |
||||
|
||||
except ResponseError as e: |
||||
print_red("Error!") |
||||
print(e) |
||||
return "An error occurred." |
||||
# print_rainbow(response.__dict__) |
||||
# If user_input is provided, update the last message |
||||
|
||||
if user_input: |
||||
if context: |
||||
if len(context) > 2000: |
||||
context = self.make_summary(context) |
||||
user_input = ( |
||||
f"{user_input}\n\nUse the information below to answer the question.\n" |
||||
f'"""{context}"""\n[This is a summary of the context provided in the original message.]' |
||||
) |
||||
system_message_info = "\nSometimes some of the messages in the chat history are summarised; when that is the case, it is clearly indicated in the message."
||||
if system_message_info not in self.messages[0]["content"]: |
||||
self.messages[0]["content"] += system_message_info |
||||
self.messages[-1] = {"role": "user", "content": user_input} |
||||
|
||||
# self.chosen_backend = self.client.last_response.headers.get("X-Chosen-Backend") |
||||
|
||||
# Handle streaming response |
||||
if stream: |
||||
print_purple("STREAMING") |
||||
return self.read_stream(response) |
||||
else: |
||||
print_purple("NOT STREAMING") |
||||
# Process the response |
||||
if isinstance(response, ChatResponse): |
||||
result = response.message.content.strip('"') |
||||
if '</think>' in result: |
||||
result = result.split('</think>')[-1] |
||||
self.messages.append( |
||||
{"role": "assistant", "content": result.strip('"')} |
||||
) |
||||
if tools and not response.message.get("tool_calls"): |
||||
print_yellow("No tool calls in response".upper()) |
||||
if not self.chat: |
||||
self.messages = [self.messages[0]] |
||||
|
||||
if not think: |
||||
response.message.content = remove_thinking(response.message.content) |
||||
return response.message |
||||
else: |
||||
print_red("Unexpected response type") |
||||
return "An error occurred." |
||||
|
||||
def make_summary(self, text): |
||||
# Implement your summary logic using self.client.chat() |
||||
summary_message = { |
||||
"role": "user", |
||||
"content": f'Summarize the text below:\n"""{text}"""\nRemember to be concise and detailed. Answer in English.', |
||||
} |
||||
messages = [ |
||||
{ |
||||
"role": "system", |
||||
"content": "You are summarizing a text. Make it detailed and concise. Answer ONLY with the summary. Don't add any new information.", |
||||
}, |
||||
summary_message, |
||||
] |
||||
try: |
||||
response = self.client.chat( |
||||
model=self.get_model("small"), |
||||
messages=messages, |
||||
options=Options(temperature=0.01), |
||||
keep_alive=3600 * 24 * 7, |
||||
) |
||||
summary = response.message.content.strip() |
||||
print_blue("Summary:", summary) |
||||
return summary |
||||
except ResponseError as e: |
||||
print_red("Error generating summary:", e) |
||||
return "Summary generation failed." |
||||
|
||||
def read_stream(self, response): |
||||
""" |
||||
Yields tuples of (chunk_type, text). The first tuple is ('thinking', ...) |
||||
if in_thinking is True and stops at </think>. After that, yields ('normal', ...) |
||||
for the rest of the text. |
||||
""" |
||||
thinking_buffer = "" |
||||
in_thinking = self.call_model == self.get_model("reasoning") |
||||
first_chunk = True |
||||
prev_content = None |
||||
|
||||
for chunk in response: |
||||
if not chunk: |
||||
continue |
||||
content = chunk.message.content |
||||
|
||||
# Remove leading quote if it's the first chunk |
||||
if first_chunk and content.startswith('"'): |
||||
content = content[1:] |
||||
first_chunk = False |
||||
|
||||
if in_thinking: |
||||
thinking_buffer += content |
||||
if "</think>" in thinking_buffer: |
||||
end_idx = thinking_buffer.index("</think>") + len("</think>") |
||||
yield ("thinking", thinking_buffer[:end_idx]) |
||||
remaining = thinking_buffer[end_idx:].strip('"') |
||||
if chunk.done and remaining: |
||||
yield ("normal", remaining) |
||||
break |
||||
else: |
||||
prev_content = remaining |
||||
in_thinking = False |
||||
else: |
||||
if prev_content: |
||||
yield ("normal", prev_content) |
||||
prev_content = content |
||||
|
||||
if chunk.done: |
||||
if prev_content and prev_content.endswith('"'): |
||||
prev_content = prev_content[:-1] |
||||
if prev_content: |
||||
yield ("normal", prev_content) |
||||
break |
||||
|
||||
self.messages.append({"role": "assistant", "content": ""}) |
||||
|
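# Hedged usage sketch (not in the original file): consuming the (chunk_type, text) tuples
# yielded by read_stream when generate(..., stream=True) is used; "llm" is an assumed LLM() instance.
#
# for kind, text in llm.generate(query="Why is the sky blue?", stream=True):
#     if kind == "thinking":
#         pass  # e.g. hide or log the model's reasoning before </think>
#     else:
#         print(text, end="")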
||||
async def async_generate( |
||||
self, |
||||
query: str = None, |
||||
user_input: str = None, |
||||
context: str = None, |
||||
stream: bool = False, |
||||
tools: list = None, |
||||
images: list = None, |
||||
model: Optional[Literal["small", "standard", "vision"]] = None, |
||||
temperature: float = None, |
||||
): |
||||
""" |
||||
Asynchronously generates a response based on the provided query and other parameters. |
||||
|
||||
Args: |
||||
query (str, optional): The query string to generate a response for. |
||||
user_input (str, optional): Additional user input to be included in the response. |
||||
context (str, optional): Context information to be used in generating the response. |
||||
stream (bool, optional): Whether to stream the response. Defaults to False. |
||||
tools (list, optional): List of tools to be used in generating the response. Will set the model to 'tools'. |
||||
images (list, optional): List of images to be included in the response. |
||||
model (Optional[Literal["small", "standard", "vision", "tools"]], optional): The model to be used for generating the response. |
||||
temperature (float, optional): The temperature setting for the model. |
||||
|
||||
Returns: |
||||
str: The generated response or an error message if an exception occurs. |
||||
|
||||
Raises: |
||||
ResponseError: If an error occurs during the response generation. |
||||
|
||||
Notes: |
||||
- The function prepares the model and temperature settings. |
||||
- It normalizes whitespace in the query and handles images if provided. |
||||
- It prepares headers and options for the request. |
||||
- It adjusts options for long messages and calls the async client's chat method. |
||||
- If user_input is provided, it updates the last message. |
||||
- It updates the chosen backend based on the response headers. |
||||
- It handles streaming responses and processes the response accordingly. |
||||
- It's not necessary to set model to 'tools' if you provide tools as an argument.
||||
""" |
||||
print_yellow("ASYNC GENERATE") |
||||
# Normalize whitespace and add the query to the messages
||||
query = re.sub(r"\s*\n\s*", "\n", query) |
||||
message = {"role": "user", "content": query} |
||||
self.messages.append(message) |
||||
|
||||
# Prepare the model and temperature |
||||
model = self.get_model(model) if model else self.model |
||||
temperature = temperature if temperature else self.options["temperature"] |
||||
|
||||
# Prepare options |
||||
options = Options(**self.options) |
||||
options.temperature = temperature |
||||
|
||||
# Prepare headers |
||||
headers = {} |
||||
|
||||
# Set model depending on the input |
||||
if images: |
||||
message = self.prepare_images(images, message) |
||||
model = self.get_model("vision") |
||||
elif tools: |
||||
model = self.get_model("tools") |
||||
headers["X-Model-Type"] = "tools" |
||||
tools = [Tool(**tool) if isinstance(tool, dict) else tool for tool in tools] |
||||
elif self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]: |
||||
headers["X-Chosen-Backend"] = self.chosen_backend |
||||
elif model == self.get_model("small"): |
||||
headers["X-Model-Type"] = "small" |
||||
|
||||
# Adjust options for long messages |
||||
if self.chat or len(self.messages) > 15000: |
||||
num_tokens = self.count_tokens() + self.max_length_answer // 2 |
||||
if num_tokens > 8000 and model not in [ |
||||
self.get_model("vision"), |
||||
self.get_model("tools"), |
||||
]: |
||||
model = self.get_model("standard_64k") |
||||
headers["X-Model-Type"] = "large" |
||||
|
||||
# Call the async client's chat method |
||||
try: |
||||
response = await self.async_client.chat( |
||||
model=model, |
||||
messages=self.messages, |
||||
headers=headers, |
||||
tools=tools, |
||||
stream=stream, |
||||
options=options, |
||||
keep_alive=3600 * 24 * 7, |
||||
) |
||||
except ResponseError as e: |
||||
print_red("Error!") |
||||
print(e) |
||||
return "An error occurred." |
||||
|
||||
# If user_input is provided, update the last message |
||||
if user_input: |
||||
if context: |
||||
if len(context) > 2000: |
||||
context = self.make_summary(context) |
||||
user_input = ( |
||||
f"{user_input}\n\nUse the information below to answer the question.\n" |
||||
f'"""{context}"""\n[This is a summary of the context provided in the original message.]' |
||||
) |
||||
system_message_info = "\nSometimes some of the messages in the chat history are summarised; when that is the case, it is clearly indicated in the message."
||||
if system_message_info not in self.messages[0]["content"]: |
||||
self.messages[0]["content"] += system_message_info |
||||
self.messages[-1] = {"role": "user", "content": user_input} |
||||
|
||||
print_red(self.async_client.last_response.headers.get("X-Chosen-Backend", "No backend")) |
||||
# Update chosen_backend |
||||
if model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]: |
||||
self.chosen_backend = self.async_client.last_response.headers.get( |
||||
"X-Chosen-Backend" |
||||
) |
||||
|
||||
# Handle streaming response |
||||
if stream: |
||||
return self.read_stream(response) |
||||
else: |
||||
# Process the response |
||||
if isinstance(response, ChatResponse): |
||||
result = response.message.content.strip('"') |
||||
self.messages.append( |
||||
{"role": "assistant", "content": result.strip('"')} |
||||
) |
||||
if tools and not response.message.get("tool_calls"): |
||||
print_yellow("No tool calls in response".upper()) |
||||
if not self.chat: |
||||
self.messages = [self.messages[0]] |
||||
return result |
||||
else: |
||||
print_red("Unexpected response type") |
||||
return "An error occurred." |
||||
|
||||
def prepare_images(self, images, message): |
||||
""" |
||||
Prepares a list of images by converting them to base64 encoded strings and adds them to the provided message dictionary. |
||||
Args: |
||||
images (list): A list of images, where each image can be a file path (str), a base64 encoded string (str), or bytes. |
||||
message (dict): A dictionary to which the base64 encoded images will be added under the key "images". |
||||
Returns: |
||||
dict: The updated message dictionary with the base64 encoded images added under the key "images". |
||||
Raises: |
||||
ValueError: If an image is not a string or bytes. |
||||
""" |
||||
import base64 |
||||
|
||||
base64_images = [] |
||||
base64_pattern = re.compile(r"^[A-Za-z0-9+/]+={0,2}$") |
||||
|
||||
for image in images: |
||||
if isinstance(image, str): |
||||
if base64_pattern.match(image): |
||||
base64_images.append(image) |
||||
else: |
||||
with open(image, "rb") as image_file: |
||||
base64_images.append( |
||||
base64.b64encode(image_file.read()).decode("utf-8") |
||||
) |
||||
elif isinstance(image, bytes): |
||||
base64_images.append(base64.b64encode(image).decode("utf-8")) |
||||
else: |
||||
print_red("Invalid image type") |
||||
|
||||
message["images"] = base64_images |
||||
# Use the vision model |
||||
|
||||
return message |
||||
|
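# Hedged usage sketch (not in the original file): passing raw image bytes to generate(),
# which routes them through prepare_images() and switches to the vision model.
# The file name is hypothetical.
#
# llm = LLM()
# with open("figure.png", "rb") as f:
#     answer = llm.generate(query="Describe this figure.", images=[f.read()], stream=False)
# print(answer.content)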
||||
def remove_thinking(response): |
||||
"""Remove the thinking section from the response""" |
||||
response_text = response.content if hasattr(response, "content") else str(response) |
||||
if "</think>" in response_text: |
||||
return response_text.split("</think>")[1].strip() |
||||
return response_text |
||||
|
||||
if __name__ == "__main__": |
||||
|
||||
llm = LLM() |
||||
|
||||
result = llm.generate( |
||||
query="I want to add 2 and 2", |
||||
) |
||||
print(result.content) |
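# Hedged sketch (not in the original file): how tool calling is driven elsewhere in this
# codebase (see Bot.process_user_input); the prompt and registry usage are illustrative.
#
# toolbot = LLM(chat=False, model="small")
# response = toolbot.generate("Find studies on sleep", tools=ToolRegistry.get_tools(), stream=False)
# for call in (response.get("tool_calls") or []):
#     print(call.function.get("name"), call.function.arguments)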
||||
@@ -0,0 +1,581 @@
||||
from _llm import LLM |
||||
|
||||
|
||||
if __name__ == "__main__": |
||||
llm = LLM() |
||||
|
||||
result = llm.generate( |
||||
query="I want to add 2 and 2", |
||||
think=True, |
||||
) |
||||
print(result) |
||||
# import os |
||||
# import base64 |
||||
# import re |
||||
# from typing import Literal, Optional |
||||
# from pydantic import BaseModel |
||||
# import requests |
||||
# import tiktoken |
||||
# from ollama import ( |
||||
# Client, |
||||
# AsyncClient, |
||||
# ResponseError, |
||||
# ChatResponse, |
||||
# Tool, |
||||
# Options, |
||||
# ) |
||||
|
||||
# import env_manager |
||||
# from colorprinter.print_color import * |
||||
|
||||
# env_manager.set_env() |
||||
|
||||
# tokenizer = tiktoken.get_encoding("cl100k_base") |
||||
|
||||
|
||||
# class LLM: |
||||
# """ |
||||
# LLM class for interacting with an instance of Ollama. |
||||
|
||||
# Attributes: |
||||
# model (str): The model to be used for response generation. |
||||
# system_message (str): The system message to be used in the chat. |
||||
# options (dict): Options for the model, such as temperature. |
||||
# messages (list): List of messages in the chat. |
||||
# max_length_answer (int): Maximum length of the generated answer. |
||||
# chat (bool): Whether the chat mode is enabled. |
||||
# chosen_backend (str): The chosen backend server for the API. |
||||
# client (Client): The client for synchronous API calls. |
||||
# async_client (AsyncClient): The client for asynchronous API calls. |
||||
# tools (list): List of tools to be used in generating the response. |
||||
|
||||
# Methods: |
||||
# __init__(self, system_message, temperature, model, max_length_answer, messages, chat, chosen_backend): |
||||
# Initializes the LLM class with the provided parameters. |
||||
|
||||
# get_model(self, model_alias): |
||||
# Retrieves the model name based on the provided alias. |
||||
|
||||
# count_tokens(self): |
||||
# Counts the number of tokens in the messages. |
||||
|
||||
# get_least_conn_server(self): |
||||
# Retrieves the least connected server from the backend. |
||||
|
||||
# generate(self, query, user_input, context, stream, tools, images, model, temperature): |
||||
# Generates a response based on the provided query and options. |
||||
|
||||
# make_summary(self, text): |
||||
# Generates a summary of the provided text. |
||||
|
||||
# read_stream(self, response): |
||||
# Handles streaming responses. |
||||
|
||||
# async_generate(self, query, user_input, context, stream, tools, images, model, temperature): |
||||
# Asynchronously generates a response based on the provided query and options. |
||||
|
||||
# prepare_images(self, images, message): |
||||
# """ |
||||
|
||||
# def __init__( |
||||
# self, |
||||
# system_message: str = "You are an assistant.", |
||||
# temperature: float = 0.01, |
||||
# model: Optional[ |
||||
# Literal["small", "standard", "vision", "reasoning", "tools"] |
||||
# ] = "standard", |
||||
# max_length_answer: int = 4096, |
||||
# messages: list[dict] = None, |
||||
# chat: bool = True, |
||||
# chosen_backend: str = None, |
||||
# tools: list = None, |
||||
# ) -> None: |
||||
# """ |
||||
# Initialize the assistant with the given parameters. |
||||
|
||||
# Args: |
||||
# system_message (str): The initial system message for the assistant. Defaults to "You are an assistant.". |
||||
# temperature (float): The temperature setting for the model, affecting randomness. Defaults to 0.01. |
||||
# model (Optional[Literal["small", "standard", "vision", "reasoning"]]): The model type to use. Defaults to "standard". |
||||
# max_length_answer (int): The maximum length of the generated answer. Defaults to 4096. |
||||
# messages (list[dict], optional): A list of initial messages. Defaults to None. |
||||
# chat (bool): Whether the assistant is in chat mode. Defaults to True. |
||||
# chosen_backend (str, optional): The backend server to use. If not provided, the least connected server is chosen. |
||||
|
||||
# Returns: |
||||
# None |
||||
# """ |
||||
|
||||
# self.model = self.get_model(model) |
||||
# self.call_model = ( |
||||
# self.model |
||||
# ) # This is set per call to record which model was actually used |
||||
# self.system_message = system_message |
||||
# self.options = {"temperature": temperature} |
||||
# self.messages = messages or [{"role": "system", "content": self.system_message}] |
||||
# self.max_length_answer = max_length_answer |
||||
# self.chat = chat |
||||
|
||||
# if not chosen_backend: |
||||
# chosen_backend = self.get_least_conn_server() |
||||
# self.chosen_backend = chosen_backend |
||||
|
||||
|
||||
# headers = { |
||||
# "Authorization": f"Basic {self.get_credentials()}", |
||||
# "X-Chosen-Backend": self.chosen_backend, |
||||
# } |
||||
# self.host_url = os.getenv("LLM_API_URL").rstrip("/api/chat/") |
||||
# self.host_url = 'http://192.168.1.12:3300' #! Change back when possible |
||||
# self.client: Client = Client(host=self.host_url, headers=headers, timeout=240) |
||||
# self.async_client: AsyncClient = AsyncClient() |
||||
|
||||
# def get_credentials(self): |
||||
# # Initialize the client with the host and default headers |
||||
# credentials = f"{os.getenv('LLM_API_USER')}:{os.getenv('LLM_API_PWD_LASSE')}" |
||||
# return base64.b64encode(credentials.encode()).decode() |
||||
|
||||
# def get_model(self, model_alias): |
||||
|
||||
# models = { |
||||
# "standard": "LLM_MODEL", |
||||
# "small": "LLM_MODEL_SMALL", |
||||
# "vision": "LLM_MODEL_VISION", |
||||
# "standard_64k": "LLM_MODEL_LARGE", |
||||
# "reasoning": "LLM_MODEL_REASONING", |
||||
# "tools": "LLM_MODEL_TOOLS", |
||||
# } |
||||
# model = os.getenv(models.get(model_alias, "LLM_MODEL")) |
||||
# self.model = model |
||||
# return model |
||||
|
||||
# def count_tokens(self): |
||||
# num_tokens = 0 |
||||
# for i in self.messages: |
||||
# for k, v in i.items(): |
||||
# if k == "content": |
||||
# if not isinstance(v, str): |
||||
# v = str(v) |
||||
# tokens = tokenizer.encode(v) |
||||
# num_tokens += len(tokens) |
||||
# return int(num_tokens) |
||||
|
||||
# def get_least_conn_server(self): |
||||
# try: |
||||
# response = requests.get("http://192.168.1.12:5000/least_conn") |
||||
# response.raise_for_status() |
||||
# # Extract the least connected server from the response |
||||
# least_conn_server = response.headers.get("X-Upstream-Address") |
||||
# return least_conn_server |
||||
# except requests.RequestException as e: |
||||
# print_red("Error getting least connected server:", e) |
||||
# return None |
||||
|
||||
# def generate( |
||||
# self, |
||||
# query: str = None, |
||||
# user_input: str = None, |
||||
# context: str = None, |
||||
# stream: bool = False, |
||||
# tools: list = None, |
||||
# images: list = None, |
||||
# model: Optional[ |
||||
# Literal["small", "standard", "vision", "reasoning", "tools"] |
||||
# ] = None, |
||||
# temperature: float = None, |
||||
# messages: list[dict] = None, |
||||
# format: BaseModel = None, |
||||
# think: bool = False |
||||
# ): |
||||
# """ |
||||
# Generate a response based on the provided query and context. |
||||
# Parameters: |
||||
# query (str): The query string from the user. |
||||
# user_input (str): Additional user input to be appended to the last message. |
||||
# context (str): Contextual information to be used in generating the response. |
||||
# stream (bool): Whether to stream the response. |
||||
# tools (list): List of tools to be used in generating the response. |
||||
# images (list): List of images to be included in the response. |
||||
# model (Optional[Literal["small", "standard", "vision", "tools"]]): The model type to be used. |
||||
# temperature (float): The temperature setting for the model. |
||||
# messages (list[dict]): List of previous messages in the conversation. |
||||
# format (Optional[BaseModel]): The format of the response. |
||||
# think (bool): Whether to use the reasoning model. |
||||
|
||||
# Returns: |
||||
# str: The generated response or an error message if an exception occurs. |
||||
# """ |
||||
|
||||
# # Prepare the model and temperature |
||||
|
||||
# model = self.get_model(model) if model else self.model |
||||
# # if model == self.get_model('tools'): |
||||
# # stream = False |
||||
# temperature = temperature if temperature else self.options["temperature"] |
||||
|
||||
# if messages: |
||||
# messages = [ |
||||
# {"role": i["role"], "content": re.sub(r"\s*\n\s*", "\n", i["content"])} |
||||
# for i in messages |
||||
# ] |
||||
# message = messages.pop(-1) |
||||
# query = message["content"] |
||||
# self.messages = messages |
||||
# else: |
||||
# # Normalize whitespace and add the query to the messages |
||||
# query = re.sub(r"\s*\n\s*", "\n", query) |
||||
# message = {"role": "user", "content": query} |
||||
|
||||
# # Handle images if any |
||||
# if images: |
||||
# message = self.prepare_images(images, message) |
||||
# model = self.get_model("vision") |
||||
|
||||
# self.messages.append(message) |
||||
|
||||
# # Prepare headers |
||||
# headers = {"Authorization": f"Basic {self.get_credentials()}"} |
||||
# if self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]: #TODO Maybe reasoning shouldn't be here. |
||||
# headers["X-Chosen-Backend"] = self.chosen_backend |
||||
|
||||
# if model == self.get_model("small"): |
||||
# headers["X-Model-Type"] = "small" |
||||
# if model == self.get_model("tools"): |
||||
# headers["X-Model-Type"] = "tools" |
||||
|
||||
# reasoning_models = ['qwen3', 'deepseek'] #TODO Add more reasoning models here when added to ollama |
||||
# if any([model_name in model for model_name in reasoning_models]): |
||||
# if think: |
||||
# self.messages[-1]['content'] = f"/think\n{self.messages[-1]['content']}" |
||||
# else: |
||||
# self.messages[-1]['content'] = f"/no_think\n{self.messages[-1]['content']}" |
||||
|
||||
# # Prepare options |
||||
# options = Options(**self.options) |
||||
# options.temperature = temperature |
||||
|
||||
# # Call the client.chat method |
||||
# try: |
||||
# self.call_model = model |
||||
# self.client: Client = Client(host=self.host_url, headers=headers, timeout=300) #! |
||||
# #print_rainbow(self.client._client.__dict__) |
||||
# print_yellow(f"🤖 Generating using {model}...") |
||||
# # if headers: |
||||
# # self.client.headers.update(headers) |
||||
# response = self.client.chat( |
||||
# model=model, |
||||
# messages=self.messages, |
||||
# tools=tools, |
||||
# stream=stream, |
||||
# options=options, |
||||
# keep_alive=3600 * 24 * 7, |
||||
# format=format |
||||
# ) |
||||
|
||||
# except ResponseError as e: |
||||
# print_red("Error!") |
||||
# print(e) |
||||
# return "An error occurred." |
||||
# # print_rainbow(response.__dict__) |
||||
# # If user_input is provided, update the last message |
||||
|
||||
# if user_input: |
||||
# if context: |
||||
# if len(context) > 2000: |
||||
# context = self.make_summary(context) |
||||
# user_input = ( |
||||
# f"{user_input}\n\nUse the information below to answer the question.\n" |
||||
# f'"""{context}"""\n[This is a summary of the context provided in the original message.]' |
||||
# ) |
||||
# system_message_info = "\nSometimes some of the messages in the chat history are summarised; when that is the case, it is clearly indicated in the message." |
||||
# if system_message_info not in self.messages[0]["content"]: |
||||
# self.messages[0]["content"] += system_message_info |
||||
# self.messages[-1] = {"role": "user", "content": user_input} |
||||
|
||||
# # self.chosen_backend = self.client.last_response.headers.get("X-Chosen-Backend") |
||||
|
||||
# # Handle streaming response |
||||
# if stream: |
||||
# print_purple("STREAMING") |
||||
# return self.read_stream(response) |
||||
# else: |
||||
# # Process the response |
||||
# if isinstance(response, ChatResponse): |
||||
# result = response.message.content.strip('"') |
||||
# if '</think>' in result: |
||||
# result = result.split('</think>')[-1] |
||||
# self.messages.append( |
||||
# {"role": "assistant", "content": result.strip('"')} |
||||
# ) |
||||
# if tools and not response.message.get("tool_calls"): |
||||
# print_yellow("No tool calls in response".upper()) |
||||
# if not self.chat: |
||||
# self.messages = [self.messages[0]] |
||||
|
||||
# if not think: |
||||
# response.message.content = remove_thinking(response.message.content) |
||||
# return response.message |
||||
# else: |
||||
# print_red("Unexpected response type") |
||||
# return "An error occurred." |
||||
|
||||
# def make_summary(self, text): |
||||
# # Implement your summary logic using self.client.chat() |
||||
# summary_message = { |
||||
# "role": "user", |
||||
# "content": f'Summarize the text below:\n"""{text}"""\nRemember to be concise and detailed. Answer in English.', |
||||
# } |
||||
# messages = [ |
||||
# { |
||||
# "role": "system", |
||||
# "content": "You are summarizing a text. Make it detailed and concise. Answer ONLY with the summary. Don't add any new information.", |
||||
# }, |
||||
# summary_message, |
||||
# ] |
||||
# try: |
||||
# response = self.client.chat( |
||||
# model=self.get_model("small"), |
||||
# messages=messages, |
||||
# options=Options(temperature=0.01), |
||||
# keep_alive=3600 * 24 * 7, |
||||
# ) |
||||
# summary = response.message.content.strip() |
||||
# print_blue("Summary:", summary) |
||||
# return summary |
||||
# except ResponseError as e: |
||||
# print_red("Error generating summary:", e) |
||||
# return "Summary generation failed." |
||||
|
||||
# def read_stream(self, response): |
||||
# """ |
||||
# Yields tuples of (chunk_type, text). The first tuple is ('thinking', ...) |
||||
# if in_thinking is True and stops at </think>. After that, yields ('normal', ...) |
||||
# for the rest of the text. |
||||
# """ |
||||
# thinking_buffer = "" |
||||
# in_thinking = self.call_model == self.get_model("reasoning") |
||||
# first_chunk = True |
||||
# prev_content = None |
||||
|
||||
# for chunk in response: |
||||
# if not chunk: |
||||
# continue |
||||
# content = chunk.message.content |
||||
|
||||
# # Remove leading quote if it's the first chunk |
||||
# if first_chunk and content.startswith('"'): |
||||
# content = content[1:] |
||||
# first_chunk = False |
||||
|
||||
# if in_thinking: |
||||
# thinking_buffer += content |
||||
# if "</think>" in thinking_buffer: |
||||
# end_idx = thinking_buffer.index("</think>") + len("</think>") |
||||
# yield ("thinking", thinking_buffer[:end_idx]) |
||||
# remaining = thinking_buffer[end_idx:].strip('"') |
||||
# if chunk.done and remaining: |
||||
# yield ("normal", remaining) |
||||
# break |
||||
# else: |
||||
# prev_content = remaining |
||||
# in_thinking = False |
||||
# else: |
||||
# if prev_content: |
||||
# yield ("normal", prev_content) |
||||
# prev_content = content |
||||
|
||||
# if chunk.done: |
||||
# if prev_content and prev_content.endswith('"'): |
||||
# prev_content = prev_content[:-1] |
||||
# if prev_content: |
||||
# yield ("normal", prev_content) |
||||
# break |
||||
|
||||
# self.messages.append({"role": "assistant", "content": ""}) |
||||
|
||||
# async def async_generate( |
||||
# self, |
||||
# query: str = None, |
||||
# user_input: str = None, |
||||
# context: str = None, |
||||
# stream: bool = False, |
||||
# tools: list = None, |
||||
# images: list = None, |
||||
# model: Optional[Literal["small", "standard", "vision"]] = None, |
||||
# temperature: float = None, |
||||
# ): |
||||
# """ |
||||
# Asynchronously generates a response based on the provided query and other parameters. |
||||
|
||||
# Args: |
||||
# query (str, optional): The query string to generate a response for. |
||||
# user_input (str, optional): Additional user input to be included in the response. |
||||
# context (str, optional): Context information to be used in generating the response. |
||||
# stream (bool, optional): Whether to stream the response. Defaults to False. |
||||
# tools (list, optional): List of tools to be used in generating the response. Will set the model to 'tools'. |
||||
# images (list, optional): List of images to be included in the response. |
||||
# model (Optional[Literal["small", "standard", "vision", "tools"]], optional): The model to be used for generating the response. |
||||
# temperature (float, optional): The temperature setting for the model. |
||||
|
||||
# Returns: |
||||
# str: The generated response or an error message if an exception occurs. |
||||
|
||||
# Raises: |
||||
# ResponseError: If an error occurs during the response generation. |
||||
|
||||
# Notes: |
||||
# - The function prepares the model and temperature settings. |
||||
# - It normalizes whitespace in the query and handles images if provided. |
||||
# - It prepares headers and options for the request. |
||||
# - It adjusts options for long messages and calls the async client's chat method. |
||||
# - If user_input is provided, it updates the last message. |
||||
# - It updates the chosen backend based on the response headers. |
||||
# - It handles streaming responses and processes the response accordingly. |
||||
# - It's not necessary to set model to 'tools' if you provide tools as an argument. |
||||
# """ |
||||
# print_yellow("ASYNC GENERATE") |
||||
# # Normalize whitespace and add the query to the messages |
||||
# query = re.sub(r"\s*\n\s*", "\n", query) |
||||
# message = {"role": "user", "content": query} |
||||
# self.messages.append(message) |
||||
|
||||
# # Prepare the model and temperature |
||||
# model = self.get_model(model) if model else self.model |
||||
# temperature = temperature if temperature else self.options["temperature"] |
||||
|
||||
# # Prepare options |
||||
# options = Options(**self.options) |
||||
# options.temperature = temperature |
||||
|
||||
# # Prepare headers |
||||
# headers = {} |
||||
|
||||
# # Set model depending on the input |
||||
# if images: |
||||
# message = self.prepare_images(images, message) |
||||
# model = self.get_model("vision") |
||||
# elif tools: |
||||
# model = self.get_model("tools") |
||||
# headers["X-Model-Type"] = "tools" |
||||
# tools = [Tool(**tool) if isinstance(tool, dict) else tool for tool in tools] |
||||
# elif self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]: |
||||
# headers["X-Chosen-Backend"] = self.chosen_backend |
||||
# elif model == self.get_model("small"): |
||||
# headers["X-Model-Type"] = "small" |
||||
|
||||
# # Adjust options for long messages |
||||
# if self.chat or len(self.messages) > 15000: |
||||
# num_tokens = self.count_tokens() + self.max_length_answer // 2 |
||||
# if num_tokens > 8000 and model not in [ |
||||
# self.get_model("vision"), |
||||
# self.get_model("tools"), |
||||
# ]: |
||||
# model = self.get_model("standard_64k") |
||||
# headers["X-Model-Type"] = "large" |
||||
|
||||
# # Call the async client's chat method |
||||
# try: |
||||
# response = await self.async_client.chat( |
||||
# model=model, |
||||
# messages=self.messages, |
||||
# headers=headers, |
||||
# tools=tools, |
||||
# stream=stream, |
||||
# options=options, |
||||
# keep_alive=3600 * 24 * 7, |
||||
# ) |
||||
# except ResponseError as e: |
||||
# print_red("Error!") |
||||
# print(e) |
||||
# return "An error occurred." |
||||
|
||||
# # If user_input is provided, update the last message |
||||
# if user_input: |
||||
# if context: |
||||
# if len(context) > 2000: |
||||
# context = self.make_summary(context) |
||||
# user_input = ( |
||||
# f"{user_input}\n\nUse the information below to answer the question.\n" |
||||
# f'"""{context}"""\n[This is a summary of the context provided in the original message.]' |
||||
# ) |
||||
# system_message_info = "\nSometimes some of the messages in the chat history are summarised; when that is the case, it is clearly indicated in the message." |
||||
# if system_message_info not in self.messages[0]["content"]: |
||||
# self.messages[0]["content"] += system_message_info |
||||
# self.messages[-1] = {"role": "user", "content": user_input} |
||||
|
||||
# print_red(self.async_client.last_response.headers.get("X-Chosen-Backend", "No backend")) |
||||
# # Update chosen_backend |
||||
# if model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]: |
||||
# self.chosen_backend = self.async_client.last_response.headers.get( |
||||
# "X-Chosen-Backend" |
||||
# ) |
||||
|
||||
# # Handle streaming response |
||||
# if stream: |
||||
# return self.read_stream(response) |
||||
# else: |
||||
# # Process the response |
||||
# if isinstance(response, ChatResponse): |
||||
# result = response.message.content.strip('"') |
||||
# self.messages.append( |
||||
# {"role": "assistant", "content": result.strip('"')} |
||||
# ) |
||||
# if tools and not response.message.get("tool_calls"): |
||||
# print_yellow("No tool calls in response".upper()) |
||||
# if not self.chat: |
||||
# self.messages = [self.messages[0]] |
||||
# return result |
||||
# else: |
||||
# print_red("Unexpected response type") |
||||
# return "An error occurred." |
||||
|
||||
# def prepare_images(self, images, message): |
||||
# """ |
||||
# Prepares a list of images by converting them to base64 encoded strings and adds them to the provided message dictionary. |
||||
# Args: |
||||
# images (list): A list of images, where each image can be a file path (str), a base64 encoded string (str), or bytes. |
||||
# message (dict): A dictionary to which the base64 encoded images will be added under the key "images". |
||||
# Returns: |
||||
# dict: The updated message dictionary with the base64 encoded images added under the key "images". |
||||
# Raises: |
||||
# ValueError: If an image is not a string or bytes. |
||||
# """ |
||||
# import base64 |
||||
|
||||
# base64_images = [] |
||||
# base64_pattern = re.compile(r"^[A-Za-z0-9+/]+={0,2}$") |
||||
|
||||
# for image in images: |
||||
# if isinstance(image, str): |
||||
# if base64_pattern.match(image): |
||||
# base64_images.append(image) |
||||
# else: |
||||
# with open(image, "rb") as image_file: |
||||
# base64_images.append( |
||||
# base64.b64encode(image_file.read()).decode("utf-8") |
||||
# ) |
||||
# elif isinstance(image, bytes): |
||||
# base64_images.append(base64.b64encode(image).decode("utf-8")) |
||||
# else: |
||||
# print_red("Invalid image type") |
||||
|
||||
# message["images"] = base64_images |
||||
# # Use the vision model |
||||
|
||||
# return message |
||||
|
||||
# def remove_thinking(response): |
||||
# """Remove the thinking section from the response""" |
||||
# response_text = response.content if hasattr(response, "content") else str(response) |
||||
# if "</think>" in response_text: |
||||
# return response_text.split("</think>")[1].strip() |
||||
# return response_text |
||||
|
||||
# if __name__ == "__main__": |
||||
|
||||
# llm = LLM() |
||||
|
||||
# result = llm.generate( |
||||
# query="I want to add 2 and 2", |
||||
# ) |
||||
# print(result.content) |
||||
File diff suppressed because it is too large
@ -0,0 +1,334 @@ |
||||
from pydantic import BaseModel, Field |
||||
from typing import Dict, List, Tuple, Optional, Any |
||||
|
||||
class ArticleChunk(BaseModel): |
||||
summary: str |
||||
tags: List[str] |
||||
references: Optional[List[str]] |
||||
|
||||
|
||||
class QueryResponse(BaseModel): |
||||
""" |
||||
Represents a query generated for retrieving documents from a vector database. |
||||
|
||||
Attributes: |
||||
query (str): The generated query text, short and concise. |
||||
""" |
||||
|
||||
query: str = Field( |
||||
description="The generated query that will be used to retrieve documents from a vector database (ChromaDB). Should be short and concise.", |
||||
example="capital of France", |
||||
) |
||||
|
||||
class ArticleMetadataResponse(BaseModel): |
||||
""" |
||||
Represents structured metadata extracted from an article by an LLM. |
||||
""" |
||||
published_date: Optional[str] = Field( |
||||
description="The publication date of the article in YYYY-MM-DD format." |
||||
) |
||||
title: str = Field( |
||||
description="The full title of the article." |
||||
) |
||||
journal: Optional[str] = Field( |
||||
description="The name of the journal/paper/outlet where the article was published." |
||||
) |
||||
|
||||
|
||||
class PlanEvaluationResponse(BaseModel): |
||||
""" |
||||
Represents the evaluation of a plan's step. |
||||
|
||||
Attributes: |
||||
reasoning (str): Explanation of the reasoning behind the evaluation. |
||||
complete (bool): Indicates if the step has sufficient information to proceed. |
||||
""" |
||||
|
||||
reasoning: str = Field( |
||||
description="A short explanation of the reasoning behind the evaluation", |
||||
example="Although some information is missing, the existing data is sufficient to complete the step.", |
||||
) |
||||
complete: bool = Field( |
||||
description="Indicates whether the information is sufficient to complete the step", |
||||
example=False, |
||||
) |
||||
|
||||
|
||||
class EvaluateFormat(BaseModel): |
||||
""" |
||||
Represents the evaluation format for determining sufficiency of information. |
||||
|
||||
Attributes: |
||||
explanation (str): Explanation of whether the information is sufficient. |
||||
status (bool): Indicates sufficiency of the information. |
||||
additional_info (Optional[str]): Additional information needed if insufficient. |
||||
""" |
||||
|
||||
explanation: str = Field( |
||||
description="A very short explanation of whether the information is sufficient or not", |
||||
example="The information is sufficient because...", |
||||
) |
||||
status: bool = Field( |
||||
description="If the information is sufficient to complete the step or not.", |
||||
example=True, |
||||
) |
||||
additional_info: Optional[str] = Field( |
||||
description="If the information is not sufficient, what additional information would be needed", |
||||
example="We need more information about...", |
||||
) |
||||
|
||||
|
||||
class Plan(BaseModel): |
||||
""" |
||||
Represents a structured plan with steps and corresponding tasks or facts. |
||||
|
||||
Attributes: |
||||
steps (Dict[str, List[Tuple[str, str]]]): A dictionary where keys are step names and values are lists of tasks or facts. |
||||
""" |
||||
|
||||
steps: Dict[str, List[Tuple[str, str]]] = Field( |
||||
description="Structured plan represented as steps with their corresponding tasks or facts", |
||||
example={ |
||||
"Step 1: Gather Existing Materials": [ |
||||
("Task 1", "Description of task"), |
||||
("Task 2", "Description of task"), |
||||
], |
||||
"Step 2: Extract Relevant Information": [ |
||||
("Task 1", "Description of task"), |
||||
("Task 2", "Description of task"), |
||||
], |
||||
}, |
||||
) |
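A minimal sketch of filling Plan from an LLM answer, assuming an LLM client whose generate method accepts the JSON schema through its format argument and returns a message with a content string (the same pattern HelperBot.make_structured_plan uses later in this changeset):

llm = LLM()  # assumed to be the LLM class from _llm
raw = llm.generate(
    "Turn the plan above into structured steps with subtasks.",
    format=Plan.model_json_schema(),
)
plan = Plan.model_validate_json(raw.content)  # raises pydantic.ValidationError on malformed JSON
for step, tasks in plan.steps.items():
    print(step, tasks)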
||||
|
||||
|
||||
class ChunkMetadata(BaseModel): |
||||
""" |
||||
Metadata associated with a document chunk. |
||||
|
||||
Attributes: |
||||
title (str): Title of the document chunk. |
||||
journal (Optional[str]): Journal where the document was published. |
||||
published_date (Optional[str]): Date of publication. |
||||
user_notes (Optional[str]): User-provided notes. |
||||
arango_id (Optional[str]): Unique identifier for the document in ArangoDB. |
||||
additional_metadata (Dict[str, Any]): Any additional metadata fields. |
||||
doi (Optional[str]): Digital Object Identifier for the document. |
||||
link: (Optional[str]): URL to access the document. |
||||
authors (Optional[List[str]]): List of authors of the document. |
||||
published_year (Optional[int]): Year of publication. |
||||
abstract: (Optional[str]): Abstract of the document. |
||||
pages: (Optional[str]): Page numbers of the document. |
||||
chroma_id (Optional[str]): Unique identifier for the chunk in ChromaDB. |
||||
""" |
||||
|
||||
title: str = Field(default="No title", description="Title of the document chunk.") |
||||
journal: Optional[str] = None |
||||
published_date: Optional[str] = None |
||||
user_notes: Optional[str] = None |
||||
arango_id: Optional[str] = None  # ArangoDB document identifier |
||||
additional_metadata: Dict[str, Any] = Field(default_factory=dict) |
||||
doi: Optional[str] = None |
||||
link: Optional[str] = None |
||||
authors: Optional[List[str]] = Field( |
||||
default_factory=list, |
||||
description="List of authors of the document.", |
||||
) |
||||
published_year: Optional[int] = Field( |
||||
default=None, |
||||
description="Year of publication.", |
||||
) |
||||
abstract: Optional[str] = Field( |
||||
default=None, |
||||
description="Abstract of the document.", |
||||
) |
||||
pages: Optional[str] = Field( |
||||
default=None, |
||||
description="Page numbers of the document.", |
||||
) |
||||
chroma_id: Optional[str] = Field( |
||||
default=None, |
||||
description="Unique identifier for the chunk in ChromaDB.", |
||||
) |
||||
|
||||
|
||||
class DocumentChunk(BaseModel): |
||||
""" |
||||
Represents a chunk of text from a document with its metadata. |
||||
|
||||
Attributes: |
||||
document (str): The text content of the chunk. |
||||
metadata (ChunkMetadata): Metadata associated with the chunk. |
||||
""" |
||||
|
||||
document: str |
||||
metadata: ChunkMetadata |
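For illustration, a chunk and its metadata can be constructed directly; all values below are placeholders:

example_chunk = DocumentChunk(
    document="Rising sea temperatures have led to increased coral bleaching events.",
    metadata=ChunkMetadata(
        title="The Impact of Climate Change on Coral Reefs",
        published_year=2023,
        pages="1-2",
    ),
)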
||||
|
||||
|
||||
|
||||
|
||||
class UnifiedDataChunk(BaseModel): |
||||
""" |
||||
Represents a unified chunk of data from any source. |
||||
|
||||
Attributes: |
||||
content (str): The main content of the chunk (e.g., text, note, or document). |
||||
metadata (Optional[Dict[str, Any]]): Metadata associated with the chunk. |
||||
source_type (str): The type of source (e.g., 'note', 'article', 'document'). |
||||
""" |
||||
|
||||
content: str = Field( |
||||
description="The main content of the chunk (e.g., text, note, or document)." |
||||
) |
||||
metadata: Optional[ChunkMetadata] = Field( |
||||
description="Metadata associated with the chunk (e.g., title, source, date).", |
||||
) |
||||
source_type: str = Field( |
||||
description="The type of source (e.g., 'note', 'article', 'document')." |
||||
) |
||||
|
||||
|
||||
class UnifiedSearchResults(BaseModel): |
||||
""" |
||||
Represents unified search results from any search tool. |
||||
|
||||
Attributes: |
||||
chunks (List[UnifiedDataChunk]): List of data chunks from the search. |
||||
source_ids (List[str]): List of unique source IDs for the chunks. |
||||
""" |
||||
|
||||
chunks: List[UnifiedDataChunk] = Field( |
||||
description="List of data chunks from the search." |
||||
) |
||||
source_ids: List[str] = Field( |
||||
default_factory=list, description="List of unique source IDs for the chunks." |
||||
) |
||||
|
||||
|
||||
class UnifiedToolResponse(BaseModel): |
||||
""" |
||||
Represents a unified response from any tool. |
||||
|
||||
Attributes: |
||||
search_results (Optional[UnifiedSearchResults]): The unified search results, returned when the tool performs a search. |
||||
text_results (Optional[List[str]]): Text results from the tool, e.g., when the tool performs an analysis. |
||||
tool_names (Optional[List[str]]): The names of the tools used to generate the response. |
||||
""" |
||||
|
||||
search_results: Optional[UnifiedSearchResults] = Field( |
||||
default=None, |
||||
description="The unified search results, if the tools used is returning search results.", |
||||
) |
||||
text_results: Optional[list[str]] = Field( |
||||
default=None, |
||||
description="Text results from the tool, e.g., if the tool is an analysis.", |
||||
) |
||||
tool_names: Optional[list[str]] = Field( |
||||
default=None, description="The name of the tool used to generate the response." |
||||
) |
||||
|
||||
def extend_search_results(self, search_results: UnifiedSearchResults) -> None: |
||||
""" |
||||
Extends the search results with additional data. |
||||
|
||||
Args: |
||||
search_results (UnifiedSearchResults): The new search results to extend. |
||||
""" |
||||
if self.search_results is None: |
||||
self.search_results = search_results |
||||
else: |
||||
self.search_results.chunks.extend(search_results.chunks) |
||||
self.search_results.source_ids.extend(search_results.source_ids) |
||||
|
||||
def extend_text_results(self, text_result: str) -> None: |
||||
""" |
||||
Extends the text result with additional data. |
||||
|
||||
Args: |
||||
text_result (str): The new text result to extend. |
||||
""" |
||||
if self.text_results is None: |
||||
self.text_results = [text_result] |
||||
else: |
||||
self.text_results.append(text_result) |
||||
|
||||
def extend_tool_name(self, tool_name: str) -> None: |
||||
""" |
||||
Extends the tool name with additional data. |
||||
|
||||
Args: |
||||
tool_name (str): The new tool name to extend. |
||||
""" |
||||
if self.tool_names is None: |
||||
self.tool_names = [tool_name] |
||||
else: |
||||
self.tool_names.append(tool_name) |
||||
|
||||
@property |
||||
def to_text(self) -> str: |
||||
""" |
||||
Generates formatted text from search results or returns the text result. |
||||
|
||||
If search_results exists, formats content from each chunk along with its source. |
||||
Otherwise, returns the text_result if available. |
||||
|
||||
Returns: |
||||
str: The formatted text from the search results, the joined text results, or a fallback message when neither is available. |
||||
""" |
||||
if self.search_results and self.search_results.chunks: |
||||
formatted_chunks = [] |
||||
for i, chunk in enumerate(self.search_results.chunks): |
||||
# Handle UnifiedDataChunk structure |
||||
content = chunk.content |
||||
metadata = chunk.metadata or ChunkMetadata()  # fall back to default metadata ("No title") when none is attached |
||||
|
||||
source_info = f"Source: {metadata.title}" |
||||
if metadata.journal: |
||||
source_info += f" - {metadata.journal}" |
||||
if metadata.published_date: |
||||
source_info += f" ({metadata.published_date})" |
||||
|
||||
# Format the chunk with its content and source |
||||
formatted_chunk = f"### Chunk {i+1}\n{content}\n\n*{source_info}*\n" |
||||
formatted_chunks.append(formatted_chunk) |
||||
|
||||
return "\n---\n".join(formatted_chunks) |
||||
elif self.text_results: |
||||
return '\n---\n'.join(self.text_results) |
||||
else: |
||||
return "No search results or text results available." |
||||
|
||||
|
||||
@property |
||||
def get_chroma_ids(self) -> List[str]: |
||||
""" |
||||
Returns the list of Chroma IDs from the search results. |
||||
|
||||
Returns: |
||||
List[str]: The list of Chroma IDs. |
||||
""" |
||||
if self.search_results and self.search_results.source_ids: |
||||
return self.search_results.source_ids |
||||
return [] |
||||
|
||||
class ChunkSearchResults(BaseModel): |
||||
""" |
||||
Represents the results of a search query across document collections. |
||||
|
||||
Attributes: |
||||
chunks (List[UnifiedDataChunk]): List of document chunks containing text and metadata. |
||||
chroma_ids (List[str]): List of Chroma IDs for the chunks. |
||||
arango_ids (List[str]): List of ArangoDB IDs for the related documents. |
||||
""" |
||||
|
||||
chunks: List[UnifiedDataChunk] = Field( |
||||
description="List of document chunks containing text, metadata, and relevance scores." |
||||
) |
||||
chroma_ids: List[str] = Field( |
||||
default_factory=list, description="List of Chroma IDs for the chunks" |
||||
) |
||||
arango_ids: List[str] = Field( |
||||
default_factory=list, |
||||
description="List of ArangoDB IDs for the related documents", |
||||
) |
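A minimal usage sketch of the unified response models defined above; the tool name, content, and IDs are placeholders:

response = UnifiedToolResponse()
response.extend_search_results(
    UnifiedSearchResults(
        chunks=[
            UnifiedDataChunk(
                content="Coral bleaching accelerates above a 1-2°C temperature rise.",
                metadata=ChunkMetadata(title="Reef study"),
                source_type="article",
            )
        ],
        source_ids=["chroma/abc123"],
    )
)
response.extend_tool_name("vector_search")
print(response.to_text)         # formatted chunks, each followed by a source line
print(response.get_chroma_ids)  # ["chroma/abc123"]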
||||
@ -1,6 +0,0 @@ |
||||
from pydantic import BaseModel |
||||
|
||||
class QueryResponse(BaseModel): |
||||
query_to_vector_database: str |
||||
short_explanation: str |
||||
|
||||
File diff suppressed because it is too large
@ -1,31 +0,0 @@ |
||||
from TTS.api import TTS |
||||
import torch |
||||
from datetime import datetime |
||||
tts = TTS("tts_models/en/multi-dataset/tortoise-v2") |
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
||||
tts.to(device) |
||||
text="There is, therefore, an increasing need to understand BEVs from a systems perspective. This involves an in-depth consideration of the environmental impact of the product using life cycle assessment (LCA) as well as taking a broader 'circular economy' approach. On the one hand, LCA is a means of assessing the environmental impact associated with all stages of a product's life from cradle to grave: from raw material extraction and processing to the product's manufacture to its use in everyday life and finally to its end of life." |
||||
|
||||
|
||||
# cloning `lj` voice from `TTS/tts/utils/assets/tortoise/voices/lj` |
||||
# with custom inference settings overriding defaults. |
||||
time_now = datetime.now().strftime("%Y%m%d%H%M%S") |
||||
output_path = f"output/tortoise_{time_now}.wav" |
||||
tts.tts_to_file(text, |
||||
file_path=output_path, |
||||
voice_dir="voices", |
||||
speaker="test", |
||||
split_sentences=False, # Change to True if context is not enough |
||||
num_autoregressive_samples=20, |
||||
diffusion_iterations=50) |
||||
|
||||
# # Using presets with the same voice |
||||
# tts.tts_to_file(text, |
||||
# file_path="output.wav", |
||||
# voice_dir="path/to/tortoise/voices/dir/", |
||||
# speaker="lj", |
||||
# preset="ultra_fast") |
||||
|
||||
# # Random voice generation |
||||
# tts.tts_to_file(text, |
||||
# file_path="output.wav") |
||||
@ -0,0 +1,209 @@ |
||||
#!/usr/bin/env python3 |
||||
""" |
||||
Test LLM Server and View Results |
||||
|
||||
This script sends a test document to the LLM server for summarization, |
||||
waits for processing to complete, and displays the results. |
||||
|
||||
Usage: |
||||
python test_and_view.py [--wait SECONDS] [--retries COUNT] |
||||
|
||||
Options: |
||||
--wait SECONDS Number of seconds to wait between polling attempts (default: 5) |
||||
--retries COUNT Maximum number of polling attempts (default: 20) |
||||
""" |
||||
|
||||
import requests |
||||
import json |
||||
import time |
||||
import os |
||||
import argparse |
||||
import sys |
||||
from _arango import ArangoDB |
||||
|
||||
|
||||
def send_test_document(): |
||||
"""Send a test document to the LLM server for summarization.""" |
||||
print("Sending test document to LLM server...") |
||||
|
||||
# Define server endpoint |
||||
url = "http://localhost:8100/summarise_document" |
||||
|
||||
# Create a sample document with unique ID based on timestamp |
||||
doc_id = f"test_articles/climate_impact_{int(time.time())}" |
||||
|
||||
sample_document = { |
||||
"arango_doc": { |
||||
"text": """ |
||||
The Impact of Climate Change on Coral Reefs |
||||
|
||||
Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable. |
||||
Rising sea temperatures have led to increased coral bleaching events, where corals expel their symbiotic algae, |
||||
leading to whitening and potential death. Studies show that even a 1-2°C increase in water temperature |
||||
can trigger mass bleaching events. Additionally, ocean acidification caused by increased CO2 absorption |
||||
makes it difficult for corals to build their calcium carbonate skeletons. |
||||
|
||||
Recent research by Johnson et al. (2023) suggests that if current trends continue, we may lose up to 90% |
||||
of coral reefs by 2050. However, some corals have shown remarkable resilience. Certain species can adapt |
||||
to higher temperatures through a process called adaptive bleaching, where they exchange their algal symbionts |
||||
for more heat-tolerant varieties. Conservation efforts focused on cultivating these resilient species may |
||||
provide hope for reef preservation. |
||||
""", |
||||
"chunks": [] |
||||
}, |
||||
"arango_db_name": "test_db", |
||||
"arango_id": doc_id, |
||||
"is_sci": True |
||||
} |
||||
|
||||
try: |
||||
# Send request to server |
||||
response = requests.post(url, json=sample_document) |
||||
|
||||
if response.status_code == 200: |
||||
print("✓ Request accepted by server") |
||||
print(f"Document ID: {doc_id}") |
||||
return { |
||||
"db_name": "test_db", |
||||
"doc_id": doc_id |
||||
} |
||||
else: |
||||
print(f"✗ Error: {response.status_code}") |
||||
print(response.text) |
||||
return None |
||||
except Exception as e: |
||||
print(f"✗ Connection error: {e}") |
||||
return None |
||||
|
||||
|
||||
def poll_for_results(doc_info, max_retries=20, wait_time=5): |
||||
"""Poll the database until the document is summarized.""" |
||||
if not doc_info: |
||||
return None |
||||
|
||||
db_name = doc_info["db_name"] |
||||
doc_id = doc_info["doc_id"] |
||||
|
||||
print(f"\nPolling for results in {db_name}/{doc_id}...") |
||||
print(f"Will check every {wait_time} seconds, up to {max_retries} times.") |
||||
|
||||
arango = ArangoDB(db_name=db_name) |
||||
|
||||
for attempt in range(max_retries): |
||||
print(f"Attempt {attempt+1}/{max_retries}... ", end="", flush=True) |
||||
|
||||
try: |
||||
# Get the document from ArangoDB |
||||
document = arango.get_document(doc_id) |
||||
|
||||
# Check if the document has been summarized |
||||
if document and "summary" in document: |
||||
print("✓ Document summary found!") |
||||
return document |
||||
|
||||
print("Document exists but no summary yet") |
||||
time.sleep(wait_time) |
||||
|
||||
except Exception as e: |
||||
print(f"Error: {e}") |
||||
time.sleep(wait_time) |
||||
|
||||
print("\n✗ Summarization not completed after maximum retries.") |
||||
return None |
||||
|
||||
|
||||
def display_results(document): |
||||
"""Display the summarization results.""" |
||||
if not document: |
||||
print("\nNo results to display") |
||||
return |
||||
|
||||
print("\n" + "=" * 80) |
||||
print(f"RESULTS FOR DOCUMENT: {document.get('_id', 'Unknown')}") |
||||
print("=" * 80) |
||||
|
||||
# Document summary |
||||
print("\n📄 DOCUMENT SUMMARY") |
||||
print("-" * 80) |
||||
print(document["summary"]["text_sum"]) |
||||
|
||||
# Model info if available |
||||
if "meta" in document["summary"]: |
||||
meta = document["summary"]["meta"] |
||||
model = meta.get("model", "Unknown") |
||||
temp = meta.get("temperature", "Unknown") |
||||
print(f"\nGenerated using: {model} (temperature: {temp})") |
||||
|
||||
# Check for summarized chunks |
||||
if "chunks" in document and document["chunks"]: |
||||
summarized_chunks = [chunk for chunk in document["chunks"] if "summary" in chunk] |
||||
print(f"\n🧩 CHUNK SUMMARIES ({len(summarized_chunks)}/{len(document['chunks'])} chunks processed)") |
||||
|
||||
for i, chunk in enumerate(summarized_chunks): |
||||
print("\n" + "-" * 80) |
||||
print(f"Chunk {i+1}:") |
||||
print("-" * 80) |
||||
print(chunk["summary"]) |
||||
|
||||
# Display tags |
||||
if "tags" in chunk and chunk["tags"]: |
||||
print("\nTags:", ", ".join(chunk["tags"])) |
||||
|
||||
# Display references |
||||
if "references" in chunk and chunk["references"]: |
||||
print("\nReferences:") |
||||
for ref in chunk["references"]: |
||||
print(f"- {ref}") |
||||
|
||||
print("\n" + "=" * 80) |
||||
|
||||
# Provide links to web views |
||||
print("\nView in browser:") |
||||
print("- HTML view: http://localhost:8100/html_results") |
||||
print("- JSON view: http://localhost:8100/view_results") |
||||
|
||||
|
||||
def check_server_status(): |
||||
"""Check if the LLM server is running.""" |
||||
try: |
||||
requests.get("http://localhost:8100/latest_result", timeout=2) |
||||
return True |
||||
except requests.RequestException: |
||||
return False |
||||
|
||||
|
||||
def main(): |
||||
parser = argparse.ArgumentParser(description='Test LLM server and view results') |
||||
parser.add_argument('--wait', type=int, default=5, help='Seconds to wait between polling attempts') |
||||
parser.add_argument('--retries', type=int, default=20, help='Maximum number of polling attempts') |
||||
args = parser.parse_args() |
||||
|
||||
print("LLM Server Test and View") |
||||
print("======================\n") |
||||
|
||||
# Check if server is running |
||||
if not check_server_status(): |
||||
print("ERROR: Cannot connect to LLM server at http://localhost:8100") |
||||
print("Make sure the server is running before continuing.") |
||||
sys.exit(1) |
||||
|
||||
print("✓ Server is running\n") |
||||
|
||||
# Send test document |
||||
doc_info = send_test_document() |
||||
if not doc_info: |
||||
print("Failed to send test document") |
||||
sys.exit(1) |
||||
|
||||
print("\n⏳ Processing document...") |
||||
print("(This may take some time depending on model size and document complexity)") |
||||
|
||||
# Poll for results |
||||
result = poll_for_results(doc_info, max_retries=args.retries, wait_time=args.wait) |
||||
|
||||
# Display results |
||||
display_results(result) |
||||
|
||||
|
||||
if __name__ == "__main__": |
||||
main() |
||||
@ -1,51 +0,0 @@ |
||||
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub |
||||
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface |
||||
from fairseq import utils |
||||
import nltk |
||||
import torch |
||||
|
||||
# Download the required NLTK resource |
||||
nltk.download('averaged_perceptron_tagger') |
||||
|
||||
# Model loading |
||||
models, cfg, task = load_model_ensemble_and_task_from_hf_hub( |
||||
"facebook/fastspeech2-en-ljspeech", |
||||
arg_overrides={"vocoder": "hifigan", "fp16": False} |
||||
) |
||||
|
||||
# Set device |
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
||||
|
||||
# Move all models to the correct device |
||||
for model in models: |
||||
model.to(device) |
||||
|
||||
# Update configuration and build generator after moving models |
||||
TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg) |
||||
generator = task.build_generator(models, cfg) |
||||
|
||||
# Ensure the vocoder is on the correct device |
||||
generator.vocoder.model.to(device) |
||||
|
||||
# Define your text |
||||
text = """Hi there, thanks for having me! My interest in electric cars really started back when I was a teenager...""" |
||||
|
||||
# Convert text to model input |
||||
sample = TTSHubInterface.get_model_input(task, text) |
||||
|
||||
# Recursively move all tensors in sample to the correct device |
||||
sample = utils.move_to_cuda(sample) if torch.cuda.is_available() else sample |
||||
|
||||
|
||||
|
||||
# Generate speech |
||||
wav, rate = TTSHubInterface.get_prediction(task, models[0], generator, sample) |
||||
|
||||
from scipy.io.wavfile import write |
||||
|
||||
# If wav is a tensor, convert it to a NumPy array |
||||
if isinstance(wav, torch.Tensor): |
||||
wav = wav.cpu().numpy() |
||||
|
||||
# Save the audio to a WAV file |
||||
write('output_fair.wav', rate, wav) |
||||
@ -1,91 +0,0 @@ |
||||
import asyncio |
||||
import re |
||||
from pdf_highlighter import Highlighter |
||||
from _chromadb import ChromaDB |
||||
from _llm import LLM |
||||
import ollama |
||||
from colorprinter.print_color import * |
||||
from concurrent.futures import ThreadPoolExecutor |
||||
|
||||
# Wrap the synchronous generate method |
||||
async def async_generate(llm, prompt): |
||||
loop = asyncio.get_event_loop() |
||||
with ThreadPoolExecutor() as pool: |
||||
return await loop.run_in_executor(pool, llm.generate, prompt) |
||||
|
||||
|
||||
# Define the main asynchronous function to highlight the PDFs |
||||
async def highlight_pdf(data): |
||||
# Use the highlight method to highlight the relevant sentences in the PDFs |
||||
highlighted_pdf_buffer = await highlighter.highlight( |
||||
data=data, zero_indexed_pages=True # Pages are zero-based (e.g., 0, 1, 2, ...) |
||||
) |
||||
|
||||
# Save the highlighted PDF to a new file |
||||
with open("highlighted_combined_documents.pdf", "wb") as f: |
||||
f.write(highlighted_pdf_buffer.getbuffer()) |
||||
print_green("PDF highlighting completed successfully!") |
||||
|
||||
|
||||
# Initialize ChromaDB client |
||||
chromadb = ChromaDB() |
||||
|
||||
# Define the query to fetch relevant text snippets and metadata from ChromaDB |
||||
query = "How are climate researchers advocating for change in the society?" |
||||
|
||||
|
||||
# Perform the query on ChromaDB |
||||
result = chromadb.query(query, collection="sci_articles", n_results=5) |
||||
# Use zip to combine the lists into a list of dictionaries |
||||
results = [ |
||||
{"id": id_, "metadata": metadata, "document": document, "distance": distance} |
||||
for id_, metadata, document, distance in zip( |
||||
result["ids"][0], |
||||
result["metadatas"][0], |
||||
result["documents"][0], |
||||
result["distances"][0], |
||||
) |
||||
] |
||||
|
||||
for r in results: |
||||
print_rainbow(r["metadata"]) |
||||
print_yellow(type(r["metadata"]['pages'])) |
||||
# Ask a LLM a question about the text snippets |
||||
llm = LLM(model="small") |
||||
documents_string = "\n\n---\n\n".join(result["documents"][0]) |
||||
answer = llm.generate( |
||||
f'''{query}\nAnswer using the information below.\n\n"""{documents_string}"""\n\n{query}''' |
||||
) |
||||
print_green(answer) |
||||
# Now you want to highlight relevant information in the PDFs to understand what the LLM is using! |
||||
|
||||
# Each result from ChromaDB contains the PDF filename and the pages where the text is found |
||||
data = [] |
||||
for result in results: |
||||
pages = result["metadata"].get("pages") |
||||
try: |
||||
pages = [int(pages)] |
||||
except (TypeError, ValueError): |
||||
# Use re to extract the page numbers separated by commas |
||||
pages = list(map(int, re.findall(r"\d+", pages))) |
||||
|
||||
data.append( |
||||
{ |
||||
"user_input": query, |
||||
"pdf_filename": result["metadata"]["_id"], |
||||
"pages": pages, |
||||
'chunk': result['document'] |
||||
} |
||||
) |
||||
|
||||
# Initialize the Highlighter |
||||
highlighter = Highlighter( |
||||
llm=llm, # Pass the LLM to the Highlighter |
||||
comment=False, # Set to True to add explanatory comments for context |
||||
use_llm=False |
||||
) |
||||
|
||||
|
||||
|
||||
# Run the main function using asyncio |
||||
asyncio.run(highlight_pdf(data)) |
||||
@ -0,0 +1,191 @@ |
||||
import requests |
||||
import json |
||||
import time |
||||
from _arango import ArangoDB # Import ArangoDB client to fetch results |
||||
|
||||
def test_summarize_document(): |
||||
""" |
||||
Test the document summarization functionality of the LLM server by sending a POST request |
||||
to the summarize_document endpoint. |
||||
|
||||
This function creates a sample document, sends it to the LLM server, and then polls for results. |
||||
""" |
||||
print("Testing document summarization...") |
||||
|
||||
# Define server endpoint |
||||
url = "http://localhost:8100/summarise_document" |
||||
|
||||
# Create a sample document |
||||
sample_document = { |
||||
"arango_doc": { |
||||
"text": """ |
||||
The Impact of Climate Change on Coral Reefs |
||||
|
||||
Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable. |
||||
Rising sea temperatures have led to increased coral bleaching events, where corals expel their symbiotic algae, |
||||
leading to whitening and potential death. Studies show that even a 1-2°C increase in water temperature |
||||
can trigger mass bleaching events. Additionally, ocean acidification caused by increased CO2 absorption |
||||
makes it difficult for corals to build their calcium carbonate skeletons. |
||||
|
||||
Recent research by Johnson et al. (2023) suggests that if current trends continue, we may lose up to 90% |
||||
of coral reefs by 2050. However, some corals have shown remarkable resilience. Certain species can adapt |
||||
to higher temperatures through a process called adaptive bleaching, where they exchange their algal symbionts |
||||
for more heat-tolerant varieties. Conservation efforts focused on cultivating these resilient species may |
||||
provide hope for reef preservation. |
||||
""", |
||||
"chunks": [] |
||||
}, |
||||
"arango_db_name": "test_db", |
||||
"arango_id": "articles/test_article", |
||||
"is_sci": True |
||||
} |
||||
|
||||
# Send request to server |
||||
print("Sending document to server for summarization...") |
||||
response = requests.post(url, json=sample_document) |
||||
|
||||
if response.status_code == 200: |
||||
print("Request accepted. Response:", response.json()) |
||||
|
||||
# Save values for checking results later |
||||
return { |
||||
"db_name": sample_document["arango_db_name"], |
||||
"doc_id": sample_document["arango_id"] |
||||
} |
||||
else: |
||||
print(f"Error: {response.status_code}") |
||||
print(response.text) |
||||
return None |
||||
|
||||
def test_summarize_chunks(): |
||||
""" |
||||
Test the chunk summarization functionality directly by creating a sample document with chunks. |
||||
|
||||
In a real application, you'd typically query the results from the database after processing. |
||||
""" |
||||
print("\nTesting chunk summarization example...") |
||||
|
||||
# Sample document with chunks |
||||
sample_document_with_chunks = { |
||||
"arango_doc": { |
||||
"text": "", |
||||
"chunks": [ |
||||
{ |
||||
"text": "Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable. Rising sea temperatures have led to increased coral bleaching events.", |
||||
"pages": [1] |
||||
}, |
||||
{ |
||||
"text": "Studies by Smith et al. [1] show that even a 1-2°C increase in water temperature can trigger mass bleaching events. Additionally, ocean acidification makes it difficult for corals to build their calcium carbonate skeletons.", |
||||
"pages": [1, 2] |
||||
} |
||||
] |
||||
}, |
||||
"arango_db_name": "test_db", |
||||
"arango_id": "interviews/test_interview", |
||||
"is_sci": False |
||||
} |
||||
|
||||
url = "http://localhost:8100/summarise_document" |
||||
print("Sending document with chunks for summarization...") |
||||
response = requests.post(url, json=sample_document_with_chunks) |
||||
|
||||
if response.status_code == 200: |
||||
print("Request accepted. Response:", response.json()) |
||||
return { |
||||
"db_name": sample_document_with_chunks["arango_db_name"], |
||||
"doc_id": sample_document_with_chunks["arango_id"] |
||||
} |
||||
else: |
||||
print(f"Error: {response.status_code}") |
||||
print(response.text) |
||||
return None |
||||
|
||||
def poll_for_results(doc_info, max_retries=10, wait_time=5): |
||||
""" |
||||
Poll the ArangoDB database to check if the document has been summarized. |
||||
|
||||
Args: |
||||
doc_info (dict): Dictionary containing db_name and doc_id |
||||
max_retries (int): Maximum number of polling attempts |
||||
wait_time (int): Time to wait between polling attempts (seconds) |
||||
|
||||
Returns: |
||||
dict or None: The document with summaries if available, None otherwise |
||||
""" |
||||
if not doc_info: |
||||
return None |
||||
|
||||
db_name = doc_info["db_name"] |
||||
doc_id = doc_info["doc_id"] |
||||
|
||||
print(f"\nPolling for results in {db_name}/{doc_id}...") |
||||
|
||||
arango = ArangoDB(db_name=db_name) |
||||
|
||||
for attempt in range(max_retries): |
||||
print(f"Attempt {attempt+1}/{max_retries}...") |
||||
|
||||
try: |
||||
# Get the document from ArangoDB |
||||
document = arango.get_document(doc_id) |
||||
|
||||
# Check if the document has been summarized |
||||
if document and "summary" in document: |
||||
print("✓ Document summary found!") |
||||
print("-" * 50) |
||||
print("Document Summary:") |
||||
print("-" * 50) |
||||
print(document["summary"]["text_sum"]) |
||||
print("-" * 50) |
||||
|
||||
# Check if chunks have been summarized |
||||
if "chunks" in document and document["chunks"] and "summary" in document["chunks"][0]: |
||||
print("✓ Chunk summaries found!") |
||||
print("-" * 50) |
||||
print("First Chunk Summary:") |
||||
print("-" * 50) |
||||
print(document["chunks"][0]["summary"]) |
||||
print("-" * 50) |
||||
if len(document["chunks"]) > 1: |
||||
print("Tags:", document["chunks"][0]["tags"]) |
||||
|
||||
return document |
||||
|
||||
# If we haven't found summaries yet, wait and try again |
||||
time.sleep(wait_time) |
||||
|
||||
except Exception as e: |
||||
print(f"Error checking document: {e}") |
||||
time.sleep(wait_time) |
||||
|
||||
print("❌ Summarization not completed after maximum retries.") |
||||
return None |
||||
|
||||
if __name__ == "__main__": |
||||
print("LLM Server Test Script") |
||||
print("=====================\n") |
||||
|
||||
# Test if server is running |
||||
try: |
||||
requests.get("http://localhost:8100") |
||||
print("Server is running at http://localhost:8100\n") |
||||
except requests.exceptions.ConnectionError: |
||||
print("ERROR: Cannot connect to server at http://localhost:8100") |
||||
print("Make sure the server is running before continuing.\n") |
||||
exit(1) |
||||
|
||||
# Run tests and store document info for polling |
||||
doc1_info = test_summarize_document() |
||||
time.sleep(2) # Brief pause between tests |
||||
doc2_info = test_summarize_chunks() |
||||
|
||||
print("\nWaiting for background tasks to complete...") |
||||
print("This may take some time depending on LLM response speed.") |
||||
|
||||
# Poll for results (with longer wait time for the first document which needs to be chunked) |
||||
poll_for_results(doc1_info, max_retries=20, wait_time=6) |
||||
poll_for_results(doc2_info, max_retries=12, wait_time=5) |
||||
|
||||
print("\nTest script completed.") |
||||
print("If you didn't see results, the background tasks might still be processing.") |
||||
print("You can run this script again later to check, or query the database directly.") |
||||
@ -1,38 +0,0 @@ |
||||
import os |
||||
import base64 |
||||
from ollama import Client, ChatResponse |
||||
import env_manager |
||||
from colorprinter.print_color import * |
||||
import httpx |
||||
|
||||
env_manager.set_env() |
||||
|
||||
# Encode the credentials |
||||
auth = httpx.BasicAuth( |
||||
username='lasse', password=os.getenv("LLM_API_PWD_LASSE") |
||||
) |
||||
client = httpx.Client(auth=auth) |
||||
client = Client( |
||||
host="http://localhost:11434", |
||||
headers={ |
||||
"X-Chosen-Backend": "backend_ollama" # Add this header to specify the chosen backend |
||||
}, |
||||
auth=auth |
||||
) |
||||
response = client.chat( |
||||
model=os.getenv("LLM_MODEL"), |
||||
messages=[ |
||||
{ |
||||
"role": "user", |
||||
"content": "Why is the sky blue?", |
||||
}, |
||||
], |
||||
) |
||||
|
||||
# Print the response headers |
||||
|
||||
# Print the chosen backend from the headers |
||||
print("Chosen Backend:", response.headers.get("X-Chosen-Backend")) |
||||
|
||||
# Print the response content |
||||
print(response) |
||||
@ -1,9 +0,0 @@ |
||||
from _llm import LLM |
||||
|
||||
llm = LLM() |
||||
|
||||
image = '/home/lasse/sci/test_image.png' |
||||
image_bytes = open(image, 'rb').read() |
||||
print(type(image_bytes)) |
||||
response = llm.generate('What is this?', images=[image_bytes]) |
||||
print(response) |
||||
@ -1,206 +0,0 @@ |
||||
from _llm import LLM |
||||
from _arango import ArangoDB |
||||
from _chromadb import ChromaDB |
||||
from streamlit_chatbot import Bot |
||||
from pydantic import BaseModel, Field |
||||
from typing import Dict, List, Tuple |
||||
from colorprinter.print_color import * |
||||
from projects_page import Project |
||||
from _base_class import StreamlitBaseClass |
||||
from prompts import get_tools_prompt |
||||
|
||||
class ResearchBase(Bot): |
||||
def __init__(self, username, **args): |
||||
super().__init__(username=username, **args) |
||||
self.llm = LLM() |
||||
self.arango = ArangoDB() |
||||
self.chromadb = ChromaDB() |
||||
self.messages = [] |
||||
|
||||
def start(self): |
||||
self.messages = [{"role": "system", "message": self.llm.system_message}] |
||||
if self.llm.model in ["small", "standard", "vision", "reasoning", "tools"]: |
||||
self.llm.get_model(self.llm.model) |
||||
|
||||
|
||||
class ResearchManager(ResearchBase): |
||||
def __init__(self, username, project=None): |
||||
super().__init__(username=username, project=project) |
||||
self.llm.system_message = "You are an assistant helping a journalist writing a report based on extensive research." |
||||
self.llm.model = "reasoning" |
||||
self.start() |
||||
|
||||
def generate_plan(self, question): |
||||
query = f""" |
||||
A journalist wants to get a report that answers this question: "{question}" |
||||
THIS IS *NOT* A QUESTION YOU CAN ANSWER! Instead, you need to make a plan for how to answer this question. |
||||
Include what type of information you need from what available sources. |
||||
Available sources are: |
||||
- Scientific articles |
||||
- Other articles the journalist has gathered, such as blog posts, news articles, etc. |
||||
- The journalist's own notes. |
||||
- Transcribed interviews (already done, you can't produce new ones). |
||||
All of the above sources are available in a database, but you need to specify exactly what to retrieve. Be as precise as possible. |
||||
As you don't have access to the sources yourself, include steps to retrieve excerpts from the articles and select those that might be interesting. |
||||
Also include steps to verify the information. |
||||
Make the plan easy to follow and structured. |
||||
Remember: You are not answering the question, you are making *a plan* for how to answer the question using the available sources. |
||||
""" |
||||
query += f"\nTo help you understand the subject, here is a summary of the notes the journalist has taken: {self.project.notes_summary}" |
||||
query += """Please structure the plan like: |
||||
## Step 1: |
||||
- Task1: Description of task |
||||
- Task2: Description of task |
||||
## Step 2: |
||||
- Task1: Description of task |
||||
- Task2: Description of task |
||||
Etc, with as many steps and tasks as needed. |
||||
""" |
||||
return self.llm.generate(query).content |
||||
|
||||
|
||||
class ResearchAssistant(ResearchBase): |
||||
def __init__(self, username): |
||||
super().__init__(username) |
||||
self.llm.system_message = "You are a Research Assistant" |
||||
self.start() |
||||
|
||||
|
||||
class HelperBot(ResearchBase): |
||||
def __init__(self, username): |
||||
super().__init__(username) |
||||
self.llm.system_message = "You are helping a researcher to structure a text. You will get a text and turn it into structured data. Make sure not to change the meaning of the text and keep all the details in the subtasks." |
||||
self.llm.model = "small" |
||||
self.start() |
||||
|
||||
def make_structured_plan(self, text, question=None): |
||||
|
||||
class Plan(BaseModel): |
||||
steps: Dict[str, List[Tuple[str, str]]] = Field( |
||||
description="Structured plan represented as steps with their corresponding tasks or facts", |
||||
example={ |
||||
"Step 1: Gather Existing Materials": [ |
||||
("Task 1", "Description of task"), |
||||
("Task 2", "Description of task"), |
||||
], |
||||
"Step 2: Extract Relevant Information": [ |
||||
("Task 1", "Description of task"), |
||||
("Task 2", "Description of task"), |
||||
], |
||||
}, |
||||
) |
||||
|
||||
if question: |
||||
query = f''' This is a proposed plan for how to write a report on "{question}":\n"""{text}"""\nPlease make the plan into structured data with subtasks. Make sure to keep all the details in the subtasks.''' |
||||
else: |
||||
query = f''' This is a proposed plan for how to write a report:\n"""{text}"""\nPlease make the plan into structured data with subtasks. Make sure to keep all the details in the subtasks.''' |
||||
response = self.llm.generate(query, format=Plan.model_json_schema()) |
||||
print(response) |
||||
structured_response = Plan.model_validate_json(response.content) |
||||
print('PLAN') |
||||
print_rainbow(structured_response) |
||||
print() |
||||
return structured_response |
||||
|
||||
|
||||
class ToolBot(ResearchBase): |
||||
def __init__(self, username, tools: list): |
||||
super().__init__(username, tools=tools) |
||||
self.start() |
||||
tools_names = [tool if isinstance(tool, str) else tool.__name__ for tool in self.tools]  # tools may be passed as names (strings) or callables |
||||
tools_name_string = "– " + "\n– ".join(tools_names) |
||||
self.llm = LLM( |
||||
temperature=0, |
||||
system_message=f""" |
||||
You are a helpful assistant with tools. The tools you can choose from are: |
||||
{tools_name_string} |
||||
Your task is to choose one or more tools to answer a user's query. |
||||
DON'T come up with your own tools, only use the ones provided. |
||||
""", |
||||
chat=False, |
||||
model="tools", |
||||
) |
||||
|
||||
def propose_tools(self, task): |
||||
query = f"""What tool(s) would you use to help with this task: |
||||
"{task}" |
||||
Answer in a structured way using the tool_calls field! |
||||
""" |
||||
query = get_tools_prompt(task)  # the shared prompt template supersedes the inline query built above |
||||
response = self.llm.generate(query) |
||||
print_yellow('Model:', self.llm.model) |
||||
print_rainbow(response) |
||||
return response.tool_calls |
||||
|
||||
if __name__ == "__main__": |
||||
|
||||
base = StreamlitBaseClass(username="lasse") |
||||
project = Project( |
||||
username="lasse", |
||||
project_name="Monarch butterflies", |
||||
user_arango=base.get_arango(), |
||||
) |
||||
rm = ResearchManager(username="lasse", project=project) |
||||
tb = ToolBot( |
||||
username="lasse", |
||||
tools=[ |
||||
"fetch_science_articles_tool", |
||||
"fetch_notes_tool", |
||||
"fetch_other_documents_tool", |
||||
"fetch_science_articles_and_other_documents_tool", |
||||
] |
||||
) |
||||
# ra = ResearchAssistant(username="lasse") |
||||
hb = HelperBot(username="lasse") |
||||
|
||||
question = "Tell me five interesting facts about the Monarch butterfly" |
||||
|
||||
# Generate plan |
||||
plan = rm.generate_plan(question) |
||||
# -- Example of what a plan can look like -- |
||||
# plan = """## Step-by-Step Plan for Answering the Question: "Tell Me Five Interesting Facts About the Monarch Butterfly" |
||||
|
||||
# ### Step 1: Gather and Organize Existing Materials |
||||
# - **Task 1:** Retrieve all existing materials related to Monarch butterflies from the database using keywords such as "Monarch butterfly migration," "habitat loss," "milkweed," "insecticides," "climate change," "Monarch Butterfly Biosphere Reserve," and "migration patterns." |
||||
# - **Task 2:** Categorize these materials into scientific articles, other articles (blogs, news), own notes, and transcribed interviews for easy access. |
||||
|
||||
# ### Step 2: Extract Relevant Excerpts |
||||
# - **Task 1:** From the retrieved scientific articles, extract information on migration patterns, genetic studies, and population decline factors. |
||||
# - **Task 2:** From blogs and news articles, look for interesting anecdotes or recent findings about conservation efforts and unique behaviors of Monarch butterflies. |
||||
|
||||
# ### Step 3: Identify Potential Interesting Facts |
||||
# - **Task 1:** Review the extracted excerpts to identify potential facts such as migration patterns, threats faced by Monarchs, population decline statistics, conservation efforts, and unique behaviors. |
||||
# - **Task 2:** Compile a list of five compelling and accurate facts based on the extracted information. |
||||
|
||||
# ### Step 4: Verify Information |
||||
# - **Task 1:** Cross-check each fact with multiple sources to ensure accuracy. For example, verify migration details across scientific articles and recent news reports. |
||||
# - **Task 2:** Look for consensus among sources regarding population trends and threats to Monarchs. |
||||
|
||||
# ### Step 5: Structure the Report |
||||
# - **Task 1:** Organize the five selected facts into a coherent structure, ensuring each fact is clearly explained and engaging. |
||||
# - **Task 2:** Incorporate quotes or statistics from sources to add depth and credibility to each fact. |
||||
|
||||
# ### Step 6: Review and Finalize |
||||
# - **Task 1:** Proofread the report for clarity, accuracy, and grammar. |
||||
# - **Task 2:** Ensure all information is presented in an engaging manner suitable for a journalistic report. |
||||
|
||||
# This plan ensures that the journalist systematically gathers, verifies, and presents five interesting facts about Monarch butterflies, providing a comprehensive and accurate report. |
||||
# """ |
||||
#print_blue(plan) |
||||
if "</think>" in plan: |
||||
plan = plan.split("</think>")[1] |
||||
|
||||
# Make structured plan |
||||
structured_plan = hb.make_structured_plan(plan, question) |
||||
|
||||
|
||||
for step, tasks in structured_plan.steps.items(): |
||||
print_blue("\n### Step:", step) |
||||
for task in tasks: |
||||
|
||||
print_blue("Task:", task[0]) |
||||
print_yellow(task[1]) |
||||
|
||||
tools = tb.propose_tools(task[1]) |
||||
print_green("Tools:", tools) |
||||
print('\n') |
||||
@ -0,0 +1,123 @@ |
||||
import requests |
||||
import json |
||||
import time |
||||
|
||||
def test_summarize_document(): |
||||
""" |
||||
Test the document summarization functionality of the LLM server by sending a POST request |
||||
to the summarise_document endpoint. |
||||
|
||||
This function creates a sample document, sends it to the LLM server, and reports whether the request was accepted. |
||||
""" |
||||
print("Testing document summarization...") |
||||
|
||||
# Define server endpoint |
||||
url = "http://localhost:8100/summarise_document" |
||||
|
||||
# Create a sample document |
||||
sample_document = { |
||||
"arango_doc": { |
||||
"text": """ |
||||
The Impact of Climate Change on Coral Reefs |
||||
|
||||
Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable. |
||||
Rising sea temperatures have led to increased coral bleaching events, where corals expel their symbiotic algae, |
||||
leading to whitening and potential death. Studies show that even a 1-2°C increase in water temperature |
||||
can trigger mass bleaching events. Additionally, ocean acidification caused by increased CO2 absorption |
||||
makes it difficult for corals to build their calcium carbonate skeletons. |
||||
|
||||
Recent research by Johnson et al. (2023) suggests that if current trends continue, we may lose up to 90% |
||||
of coral reefs by 2050. However, some corals have shown remarkable resilience. Certain species can adapt |
||||
to higher temperatures through a process called adaptive bleaching, where they exchange their algal symbionts |
||||
for more heat-tolerant varieties. Conservation efforts focused on cultivating these resilient species may |
||||
provide hope for reef preservation. |
||||
""", |
||||
"chunks": [] |
||||
}, |
||||
"arango_db_name": "test_db", |
||||
"arango_id": "articles/test_article", |
||||
"is_sci": True |
||||
} |
||||
|
||||
# Send request to server |
||||
print("Sending document to server for summarization...") |
||||
response = requests.post(url, json=sample_document) |
||||
|
||||
if response.status_code == 200: |
||||
print("Request accepted. Response:", response.json()) |
||||
|
||||
# In a real-world scenario, you might poll the database to see when the summary is ready |
||||
print("Note: In a real implementation, you would check the database for results.") |
||||
print("Since this is just a test, we're showing how the request works.") |
||||
|
||||
return True |
||||
else: |
||||
print(f"Error: {response.status_code}") |
||||
print(response.text) |
||||
return False |
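| 
# Minimal polling sketch (an assumption, not exercised by these tests): with |
# python-arango installed you could wait for the background task to finish and |
# read the stored summary back. The host, credentials and collection names |
# below are placeholders for your own setup. |
def poll_for_summary(db_name="test_db", collection="articles", key="test_article", retries=10, wait_time=5): |
    from arango import ArangoClient  # pip install python-arango |
    db = ArangoClient(hosts="http://localhost:8529").db(db_name, username="root", password="changeme") |
    for _ in range(retries): |
        doc = db.collection(collection).get(key) |
        if doc and "summary" in doc:  # the summary field is attached once processing completes |
            return doc["summary"] |
        time.sleep(wait_time) |
    return None |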
||||
|
||||
def test_summarize_chunks(): |
||||
""" |
||||
Test the chunk summarization functionality directly by creating a sample document with chunks. |
||||
|
||||
In a real application, you'd typically query the results from the database after processing. |
||||
""" |
||||
print("\nTesting chunk summarization example...") |
||||
|
||||
# Sample document with chunks |
||||
sample_document_with_chunks = { |
||||
"arango_doc": { |
||||
"text": "", |
||||
"chunks": [ |
||||
{ |
||||
"text": "Climate change has significantly affected marine ecosystems worldwide, with coral reefs being among the most vulnerable. Rising sea temperatures have led to increased coral bleaching events.", |
||||
"pages": [1] |
||||
}, |
||||
{ |
||||
"text": "Studies by Smith et al. [1] show that even a 1-2°C increase in water temperature can trigger mass bleaching events. Additionally, ocean acidification makes it difficult for corals to build their calcium carbonate skeletons.", |
||||
"pages": [1, 2] |
||||
} |
||||
] |
||||
}, |
||||
"arango_db_name": "test_db", |
||||
"arango_id": "interviews/test_interview", |
||||
"is_sci": False |
||||
} |
||||
|
||||
# In a real implementation, you would: |
||||
# 1. Send this document to the server |
||||
# 2. Check the database later to see the summarized chunks |
||||
|
||||
url = "http://localhost:8100/summarise_document" |
||||
print("Sending document with chunks for summarization...") |
||||
response = requests.post(url, json=sample_document_with_chunks) |
||||
|
||||
if response.status_code == 200: |
||||
print("Request accepted. Response:", response.json()) |
||||
return True |
||||
else: |
||||
print(f"Error: {response.status_code}") |
||||
print(response.text) |
||||
return False |
||||
|
||||
if __name__ == "__main__": |
||||
print("LLM Server Test Script") |
||||
print("=====================\n") |
||||
|
||||
# Test if server is running |
||||
try: |
||||
requests.get("http://localhost:8100") |
||||
print("Server is running at http://localhost:8100\n") |
||||
except requests.exceptions.ConnectionError: |
||||
print("ERROR: Cannot connect to server at http://localhost:8100") |
||||
print("Make sure the server is running before continuing.\n") |
||||
exit(1) |
||||
|
||||
# Run tests |
||||
test_summarize_document() |
||||
time.sleep(2) # Brief pause between tests |
||||
test_summarize_chunks() |
||||
|
||||
print("\nTest script completed. Check your ArangoDB instance for results.") |
||||
print("Note: Document summarization happens in background tasks, so results may not be immediate.") |
||||
print("You would typically query the database to see the updated documents with summaries.") |
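| 
# Rough shape of an updated document once summarisation has finished (inferred |
# from view_latest_results.py, not an exact schema): a top-level "summary" with |
# the text and model metadata, plus per-chunk summaries, tags and references. |
# { |
#     "_id": "articles/test_article", |
#     "summary": {"text_sum": "...", "meta": {"model": "...", "temperature": ...}}, |
#     "chunks": [ |
#         {"text": "...", "pages": [1], "summary": "...", "tags": ["..."], "references": ["..."]} |
#     ], |
# } |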
||||
@ -1,45 +0,0 @@ |
||||
import torch |
||||
from TTS.api import TTS |
||||
from datetime import datetime |
||||
# Get device |
||||
from TTS.tts.utils.speakers import SpeakerManager |
||||
device = "cuda" if torch.cuda.is_available() else "cpu" |
||||
|
||||
|
||||
# Init TTS |
||||
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) |
||||
|
||||
|
||||
# exit()  # early exit disabled so that the synthesis below actually runs |
||||
|
||||
|
||||
|
||||
|
||||
text = """Hi there, thanks for having me! My interest in electric cars really started back when I was a teenager. I remember learning about the history of EVs and how they've been around since the late 1800s, even before gasoline cars took over. The fact that these vehicles could run on electricity instead of fossil fuels just fascinated me. |
||||
|
||||
Then, in the 90s, General Motors introduced the EV1 - it was a real game-changer. It showed that electric cars could be practical and enjoyable to drive. And when Tesla came along with their Roadster in 2007, proving that EVs could have a long range, I was hooked. |
||||
|
||||
But what really sealed my interest was learning about the environmental impact of EVs. They produce zero tailpipe emissions, which means they can help reduce air pollution and greenhouse gas emissions. That's something I'm really passionate about. |
||||
""" |
||||
text_se = """Antalet bilar ger dock bara en del av bilden. För att förstå bilberoendet bör vi framför allt titta på hur mycket bilarna faktiskt används. |
||||
Stockholmarnas genomsnittliga körsträcka med bil har minskat sedan millennieskiftet. Den är dock lägre i Göteborg och i Malmö. |
||||
I procent har bilanvändningen sedan år 2000 minskat lika mycket i Stockholm och Malmö, 9 procent. I Göteborg är minskningen 13 procent, i riket är minskningen 7 procent.""" |
||||
# Run TTS |
||||
# ❗ Since this is a multilingual voice-cloning model, we must set the target speaker_wav and language |
||||
# Text to speech, returning a list of amplitude values as output |
||||
#wav = tts.tts(text=text, speaker_wav="my/cloning/audio.wav", language="en") |
||||
# Text to speech to a file |
||||
time_now = datetime.now().strftime("%Y%m%d%H%M%S") |
||||
output_path = f"output/tts_{time_now}.wav" |
||||
tts.tts_to_file(text=text, speaker_wav='voices/test/test_en.wav', language="en", file_path=output_path) |
||||
|
||||
|
||||
|
||||
|
||||
# api = TTS("tts_models/se/fairseq/vits") |
||||
|
||||
# api.tts_with_vc_to_file( |
||||
# text_se, |
||||
# speaker_wav="test_audio_se.wav", |
||||
# file_path="output_se.wav" |
||||
# ) |
||||
@ -1,22 +0,0 @@ |
||||
import requests |
||||
|
||||
# Define the server URL |
||||
server_url = "http://localhost:5002/api/tts" |
||||
|
||||
# Define the payload |
||||
payload = { |
||||
"text": "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", |
||||
"speaker": "Ana Florence", |
||||
"language": "en", |
||||
"split_sentences": True |
||||
} |
||||
|
||||
# Send the request to the TTS server |
||||
response = requests.post(server_url, json=payload) |
||||
|
||||
# Save the response audio to a file |
||||
if response.status_code == 200: |
||||
with open("output.wav", "wb") as f: |
||||
f.write(response.content) |
||||
else: |
||||
print(f"Error: {response.status_code}") |
||||
@ -1,33 +0,0 @@ |
||||
from TTS.tts.configs.tortoise_config import TortoiseConfig |
||||
from TTS.tts.models.tortoise import Tortoise |
||||
import torch |
||||
import os |
||||
import torchaudio |
||||
|
||||
# Initialize Tortoise model |
||||
config = TortoiseConfig() |
||||
model = Tortoise.init_from_config(config) |
||||
model.load_checkpoint(config, checkpoint_dir="tts_models/en/multi-dataset/tortoise-v2", eval=True) |
||||
|
||||
# Move model to GPU if available |
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
||||
print(device) |
||||
model.to(device) |
||||
|
||||
# Define the text and voice directory |
||||
text = "There is, therefore, an increasing need to understand BEVs from a systems perspective." |
||||
voice_dir = "voices" |
||||
speaker = "test" |
||||
|
||||
# Load voice samples |
||||
voice_samples = [] |
||||
for file_name in os.listdir(os.path.join(voice_dir, speaker)): |
||||
file_path = os.path.join(voice_dir, speaker, file_name) |
||||
waveform, sample_rate = torchaudio.load(file_path) |
||||
voice_samples.append(waveform) |
||||
|
||||
# Get conditioning latents |
||||
conditioning_latents = model.get_conditioning_latents(voice_samples) |
||||
|
||||
# Save conditioning latents to a file |
||||
torch.save(conditioning_latents, "conditioning_latents.pth") |
||||
@ -0,0 +1,111 @@ |
||||
#!/usr/bin/env python3 |
||||
""" |
||||
View Latest LLM Server Results |
||||
|
||||
This script displays the latest document summaries generated by the LLM server |
||||
directly in the terminal, providing a quick way to check results without |
||||
having to use a web browser. |
||||
|
||||
Usage: |
||||
python view_latest_results.py [--raw] [--json] |
||||
|
||||
Options: |
||||
--raw Display the raw result data |
||||
--json Format the output as JSON |
||||
""" |
||||
|
||||
import json |
||||
import os |
||||
import sys |
||||
import argparse |
||||
from datetime import datetime |
||||
|
||||
|
||||
def load_latest_result(): |
||||
"""Load the latest result from the JSON file.""" |
||||
latest_result_file = os.path.join(os.path.dirname(__file__), "latest_summary_result.json") |
||||
try: |
||||
if os.path.exists(latest_result_file): |
||||
with open(latest_result_file, 'r') as f: |
||||
return json.load(f) |
||||
else: |
||||
print(f"No results file found at {latest_result_file}") |
||||
return None |
||||
except Exception as e: |
||||
print(f"Error loading results: {e}") |
||||
return None |
||||
|
||||
|
||||
def display_raw(result): |
||||
"""Display the raw result data.""" |
||||
print(json.dumps(result, indent=2)) |
||||
|
||||
|
||||
def display_formatted(result): |
||||
"""Display the result in a nicely formatted way.""" |
||||
if not result: |
||||
print("No results available") |
||||
return |
||||
|
||||
print("\n" + "=" * 80) |
||||
print(f"DOCUMENT: {result.get('_id', 'Unknown')}") |
||||
print("=" * 80) |
||||
|
||||
# Document summary |
||||
summary = result.get("summary", {}).get("text_sum", "No summary available") |
||||
print("\n📄 DOCUMENT SUMMARY") |
||||
print("-" * 80) |
||||
print(summary) |
||||
|
||||
# Model info if available |
||||
if "summary" in result and "meta" in result["summary"]: |
||||
meta = result["summary"]["meta"] |
||||
model = meta.get("model", "Unknown") |
||||
temp = meta.get("temperature", "Unknown") |
||||
print(f"\nGenerated using: {model} (temperature: {temp})") |
||||
|
||||
# Display chunks |
||||
chunks = result.get("chunks", []) |
||||
if chunks: |
||||
summarized_chunks = [chunk for chunk in chunks if "summary" in chunk] |
||||
print(f"\n🧩 CHUNK SUMMARIES ({len(summarized_chunks)}/{len(chunks)} chunks processed)") |
||||
|
||||
for i, chunk in enumerate(summarized_chunks): |
||||
print("\n" + "-" * 80) |
||||
print(f"Chunk {i+1}:") |
||||
print("-" * 80) |
||||
print(chunk["summary"]) |
||||
|
||||
# Display tags |
||||
if "tags" in chunk and chunk["tags"]: |
||||
print("\nTags:", ", ".join(chunk["tags"])) |
||||
|
||||
# Display references |
||||
if "references" in chunk and chunk["references"]: |
||||
print("\nReferences:") |
||||
for ref in chunk["references"]: |
||||
print(f"- {ref}") |
||||
|
||||
print("\n" + "=" * 80) |
||||
|
||||
|
||||
def main(): |
||||
parser = argparse.ArgumentParser(description='View latest LLM server results') |
||||
parser.add_argument('--raw', action='store_true', help='Display raw result data') |
||||
parser.add_argument('--json', action='store_true', help='Format output as JSON') |
||||
args = parser.parse_args() |
||||
|
||||
result = load_latest_result() |
||||
|
||||
if not result: |
||||
print("No results available") |
||||
return |
||||
|
||||
if args.raw or args.json: |
||||
display_raw(result) |
||||
else: |
||||
display_formatted(result) |
||||
|
||||
|
||||
if __name__ == "__main__": |
||||
main() |
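| 
# Example invocations (matching the argparse flags above): |
#   python view_latest_results.py          # formatted view |
#   python view_latest_results.py --json   # raw JSON output |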
||||