parent 01df43bba2
commit 00fd42b32d
19 changed files with 1474 additions and 249 deletions
@@ -0,0 +1,21 @@
# Chroma
CHROMA_CLIENT_AUTH_CREDENTIALS="overpass-alms-porker-file-seigneur-kiln"
CHROMA_SERVER_AUTHN_PROVIDER="chromadb.auth.basic_authn.BasicAuthenticationServerProvider"
CHROMA_AUTH_TOKEN_TRANSPORT_HEADER="X-Chroma-Token"
CHROMA_HOST="http://192.168.1.10:8007"

CHROMA_CLIENT_AUTH_CREDENTIALS="overpass-alms-porker-file-seigneur-kiln"
CHROMA_SERVER_AUTHN_PROVIDER="chromadb.auth.basic_authn.BasicAuthenticationServerProvider"
CHROMA_AUTH_TOKEN_TRANSPORT_HEADER="X-Chroma-Token"
_CHROMA_HOST="https://lasseedfast.se/chroma_ev_cars/"

# Arango
ARANGO_HOST="http://192.168.1.10:8531"
ARANGO_USER="admin"
ARANGO_PASSWORD="raHzaw-5vyjqo-xisfec"
ARANGO_DB="base"
ARANGO_PWD_ENV_MANAGER="jagskoterenv(Y)"
ARANGO_ROOT_USER='root'
ARANGO_ROOT_PASSWORD='gyhqed-kiwNac-9buhme'

MAILERSEND_API_KEY="mlsn.71de3eb2dbcb733bd4ee509d1c95ccfc8939fd647cba9e3a0f631f60f900bd85"
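
For context, a minimal sketch of how this .env file is typically consumed (an assumption based on the dotenv usage elsewhere in this commit; the ArangoDB wrapper is the repo's own _arango module):

import os
import dotenv
from _arango import ArangoDB

dotenv.load_dotenv()  # reads the .env file above into the process environment

# Connect to the shared "base" database with the non-root credentials
arango = ArangoDB(
    user=os.getenv("ARANGO_USER"),
    password=os.getenv("ARANGO_PASSWORD"),
    db_name=os.getenv("ARANGO_DB"),
)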
@@ -0,0 +1 @@
from pdf_highlighter import Highlighter
@@ -0,0 +1,260 @@
# rss_reader.py
import feedparser
import requests
import urllib.parse
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime, timedelta
from utils import fix_key
import os
from _base_class import BaseClass
from _llm import LLM
from colorprinter.print_color import *


class RSSFeed:
    def __init__(self):
        self.url = None
        self.title = None
        self.icon_path = None
        self.description = None
        self.feed_data = None
        self.fetched_timestamp = None
        self.entries = []


class RSSReader(BaseClass):
    def __init__(self, username):
        super().__init__(username=username)
        self.username = username
        self.user_arango = self.get_arango(username)
        self.feed: RSSFeed = None
        self.arango_feed = None

    def discover_feeds(self, url):
        try:
            if not url.startswith("http"):
                url = "https://" + url

            # Check if the input URL is already an RSS feed
            f = feedparser.parse(url)
            if len(f.entries) > 0:
                return [
                    {
                        "href": url,
                        "title": f.feed.get("title", "No title"),
                        "icon": self.get_site_icon(url),
                    }
                ]

            # If not, proceed to discover feeds from the webpage
            raw = requests.get(url).text
            result = []
            possible_feeds = []
            html = BeautifulSoup(raw, "html.parser")

            # Find the site icon
            icon_url = self.get_site_icon(url, html)

            # Find all <link> tags with rel="alternate" and type containing "rss" or "xml"
            feed_urls = html.find_all("link", rel="alternate")
            for f in feed_urls:
                t = f.get("type", None)
                if t and ("rss" in t or "xml" in t):
                    href = f.get("href", None)
                    if href:
                        possible_feeds.append(urljoin(url, href))

            # Find all <a> tags with href containing "rss", "xml", or "feed"
            parsed_url = urllib.parse.urlparse(url)
            base = parsed_url.scheme + "://" + parsed_url.hostname
            atags = html.find_all("a")
            for a in atags:
                href = a.get("href", None)
                if href and ("rss" in href or "xml" in href or "feed" in href):
                    possible_feeds.append(urljoin(base, href))

            # Validate the possible feeds using feedparser
            for feed_url in list(set(possible_feeds)):
                f = feedparser.parse(feed_url)
                if len(f.entries) > 0:
                    result.append(
                        {
                            "href": feed_url,
                            "title": f.feed.get("title", "No title"),
                            "icon": icon_url,
                        }
                    )

            return result
        except Exception as e:
            print(f"Error discovering feeds: {e}")
            return []

    def add_rss_feed(self, url):
        # Check for an existing feed document first (keyword argument: the first
        # positional parameter of get_feed is feed_key, not url)
        self.get_feed(url=url)

        self.load_feed_from_url(url=url)
        self.feed._key = fix_key(self.feed.url)

        # Store feed data in base_arango's rss_feeds collection
        self.base_arango.db.collection("rss_feeds").insert(
            self.feed.__dict__, overwrite=True
        )

        # Store a reference to the feed in user_arango's user_feeds collection
        self.user_arango.db.collection("user_feeds").insert(
            {
                "_key": self.feed._key,  # Use the same key to reference the feed
                "feed_key": self.feed._key,
                "subscribed_on": datetime.now().isoformat(),
                # Add additional user-specific fields here
            },
            overwrite=True,
        )

    def load_feed_from_url(self, url=None, data=None):
        if url:
            self.feed = RSSFeed()
            self.feed.url = url
            full_feed_data = feedparser.parse(url)
        elif data:
            self.feed = RSSFeed()
            self.feed.url = data.get("url", None)
            full_feed_data = data
        else:
            full_feed_data = feedparser.parse(self.feed.url)

        self.feed.title = full_feed_data["feed"].get("title", "No title")
        self.feed.description = full_feed_data["feed"].get(
            "description", "No description"
        )
        self.feed.icon_path = self.get_site_icon(self.feed.url)
        self.feed.entries = []

        for entry in full_feed_data["entries"]:
            self.feed.entries.append(
                {
                    "title": entry.get("title", "No title"),
                    "link": entry.get("link"),
                    "published": entry.get("published"),
                    "summary": self.html_to_markdown(
                        entry.get("summary", "No summary")
                    ),
                    "id": entry.get("id"),
                    "author": entry.get("author"),
                }
            )
        self.feed.fetched_timestamp = datetime.now().isoformat()

    def feed_data2feed(self, data):
        self.load_feed_from_url(data=data)

    def parse_feed(self, url):
        self.load_feed_from_url(url=url)
        return self.feed

    def update_feed(self):
        self.load_feed_from_url()
        # Update the feed in the database (feed documents live in base_arango,
        # see add_rss_feed and get_feed)
        self.base_arango.db.collection("rss_feeds").update(
            {
                "_key": self.feed._key,
                "fetched_timestamp": self.feed.fetched_timestamp,
                "entries": self.feed.entries,
            }
        )
        return self.feed.entries

    def get_feed(self, feed_key=None, url=None, _id=None):
        arango_doc = None
        if feed_key:
            arango_doc = self.base_arango.db.collection("rss_feeds").get(feed_key)
        elif url:
            # Use bind variables rather than f-strings to avoid AQL injection
            cursor = self.base_arango.db.aql.execute(
                "FOR doc IN rss_feeds FILTER doc.url == @url LIMIT 1 RETURN doc",
                bind_vars={"url": url},
            )
            arango_doc = next(cursor, None)
        elif _id:
            cursor = self.base_arango.db.aql.execute(
                "FOR doc IN rss_feeds FILTER doc.id == @id LIMIT 1 RETURN doc",
                bind_vars={"id": _id},
            )
            arango_doc = next(cursor, None)

        if arango_doc:
            self.feed = RSSFeed()
            for attr in arango_doc:
                setattr(self.feed, attr, arango_doc[attr])

            fetched_time = datetime.fromisoformat(self.feed.fetched_timestamp)

            if datetime.now() - fetched_time < timedelta(hours=1):
                return self.feed.entries
            else:
                return self.update_feed()

    def get_site_icon(self, url, html=None):
        try:
            if not html:
                raw = requests.get(url).text
                html = BeautifulSoup(raw, "html.parser")

            icon_link = html.find("link", rel="icon")
            if icon_link:
                icon_url = icon_link.get("href", None)
                if icon_url:
                    return urljoin(url, icon_url)

            # Fallback to finding other common icon links
            icon_link = html.find("link", rel="shortcut icon")
            if icon_link:
                icon_url = icon_link.get("href", None)
                if icon_url:
                    return urljoin(url, icon_url)

            return None
        except Exception as e:
            print(f"Error getting site icon: {e}")
            return None

    def get_rss_feeds(self):
        return list(self.user_arango.db.collection("rss_feeds").all())

    def download_icon(self, icon_url, save_folder="external_icons"):
        try:
            if not os.path.exists(save_folder):
                os.makedirs(save_folder)

            response = requests.get(icon_url, stream=True)
            if response.status_code == 200:
                icon_name = os.path.basename(icon_url)
                icon_path = os.path.join(save_folder, icon_name)
                with open(icon_path, "wb") as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                return icon_path
            else:
                print(f"Failed to download icon: {response.status_code}")
                return None
        except Exception as e:
            print(f"Error downloading icon: {e}")
            return None

    def html_to_markdown(self, html):
        soup = BeautifulSoup(html, "html.parser")
        for br in soup.find_all("br"):
            br.replace_with("\n")
        for strong in soup.find_all("strong"):
            strong.replace_with(f"**{strong.text}**")
        for em in soup.find_all("em"):
            em.replace_with(f"*{em.text}*")
        for p in soup.find_all("p"):
            p.replace_with(f"{p.text}\n\n")
        return soup.get_text()

    def get_full_content(self, url):
        result = requests.get(url)
        soup = BeautifulSoup(result.content, "html.parser")
        # Return the page's visible text as the full article content
        return soup.get_text()


class RSSAnalyzer(BaseClass):
    def __init__(self, username):
        super().__init__(username=username)
        self.llm = LLM(system_message="You are reading RSS Feeds to analyze them.")
        self.user_arango = self.get_arango(username)
        self.rss_reader = RSSReader(username)
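
A minimal smoke test for the reader above (assumptions: "lasse" is a placeholder username whose ArangoDB database already exists, and BaseClass wires up base_arango/user_arango; the site URL is just an example):

from rss_reader import RSSReader

reader = RSSReader("lasse")  # placeholder username
feeds = reader.discover_feeds("arstechnica.com")  # example site
for f in feeds:
    print(f["title"], f["href"])

if feeds:
    feed = reader.parse_feed(feeds[0]["href"])
    print(feed.title, "-", len(feed.entries), "entries")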
@@ -0,0 +1,6 @@
from _arango import ArangoDB


for db in ['lasse', 'nisse', 'torill', 'irma']:
    arango = ArangoDB(db_name=db)
    # Guard against re-runs: create_collection raises if the collection already exists
    if not arango.db.has_collection('rss_feeds'):
        arango.db.create_collection('rss_feeds')
@@ -0,0 +1,192 @@
import yaml
import sys
import bcrypt
from _arango import ArangoDB
import os
import dotenv
import getpass
import argparse
import string
import secrets
from utils import fix_key
from colorprinter.print_color import *

dotenv.load_dotenv()


def read_yaml(file_path):
    with open(file_path, "r") as file:
        return yaml.safe_load(file)


def write_yaml(file_path, data):
    with open(file_path, "w") as file:
        yaml.safe_dump(data, file)


def add_user(data, username, email, name, password):
    # Check for existing username
    if username in data["credentials"]["usernames"]:
        print(f"Error: Username '{username}' already exists.")
        sys.exit(1)

    # Check for existing email
    for user in data["credentials"]["usernames"].values():
        if user["email"] == email:
            print(f"Error: Email '{email}' already exists.")
            sys.exit(1)

    # Hash the password using bcrypt
    hashed_password = bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt()).decode(
        "utf-8"
    )

    # Add the new user
    data["credentials"]["usernames"][username] = {
        "email": email,
        "name": name,
        "password": hashed_password,
    }


def make_arango(username):
    root_user = os.getenv("ARANGO_ROOT_USER")
    root_password = os.getenv("ARANGO_ROOT_PASSWORD")
    arango = ArangoDB(user=root_user, password=root_password, db_name="_system")

    if not arango.db.has_database(username):
        arango.db.create_database(
            username,
            users=[
                {
                    "username": os.getenv("ARANGO_USER"),
                    "password": os.getenv("ARANGO_PASSWORD"),
                    "active": True,
                    "extra": {},
                }
            ],
        )
    arango = ArangoDB(user=root_user, password=root_password, db_name=username)
    for collection in [
        "projects",
        "favorite_articles",
        "article_collections",
        "settings",
        "chats",
        "notes",
        "other_documents",
        "rss_feeds",
    ]:
        if not arango.db.has_collection(collection):
            arango.db.create_collection(collection)
    user_arango = ArangoDB(db_name=username)
    user_arango.db.collection("settings").insert(
        {"current_page": "Bot Chat", "current_project": None}
    )


def generate_random_password():
    # Three dash-separated groups of six alphanumeric characters, e.g. "a1B2c3-d4E5f6-g7H8i9"
    characters = string.ascii_letters + string.digits
    password = "-".join(
        "".join(secrets.choice(characters) for _ in range(6)) for _ in range(3)
    )
    return password


def delete_user(data, username):
    # Check if the user exists
    if username not in data["credentials"]["usernames"]:
        print(f"Error: Username '{username}' does not exist.")
        sys.exit(1)

    # Remove the user from the YAML data
    del data["credentials"]["usernames"][username]

    # Connect as root: to the shared base database (for the access cleanup below)
    # and to _system (to drop the user's own database)
    root_user = os.getenv("ARANGO_ROOT_USER")
    root_password = os.getenv("ARANGO_ROOT_PASSWORD")
    base_arango = ArangoDB(user=root_user, password=root_password, db_name="base")

    # Remove the user's database in ArangoDB
    arango = ArangoDB(user=root_user, password=root_password, db_name="_system")
    if arango.db.has_database(username):
        arango.db.delete_database(username)

    # Remove user access from documents in relevant collections
    collections = ["sci_articles", "other_documents"]
    for collection_name in collections:
        documents = base_arango.db.aql.execute(
            """
            FOR doc IN @@collection_name
                FILTER @username IN doc.user_access
                RETURN {'_id': doc._id, 'user_access': doc.user_access}
            """,
            bind_vars={"username": username, "@collection_name": collection_name},
        )
        for document in documents:
            if 'user_access' in document:
                # Remove username from the user_access list
                document['user_access'].remove(username)
                base_arango.db.collection(collection_name).update(document)

    print_green(f"User {username} deleted successfully.")


def main():
    parser = argparse.ArgumentParser(description="Add or delete a user.")
    parser.add_argument("--user", help="Username")
    parser.add_argument("--email", help="Email address")
    parser.add_argument("--name", help="Full name")
    parser.add_argument("--password", help="Password")
    parser.add_argument("--delete", action="store_true", help="Delete user")

    args = parser.parse_args()

    yaml_file = "streamlit_users.yaml"
    data = read_yaml(yaml_file)

    if args.delete:
        if args.user:
            username = args.user
            delete_user(data, username)
            write_yaml(yaml_file, data)
        else:
            print("Error: Username is required to delete a user.")
            sys.exit(1)
    else:
        if args.user and args.email and args.name:
            username = args.user
            email = args.email
            name = args.name
            if args.password and len(args.password) >= 8:
                password = args.password
            else:
                password = generate_random_password()
                print_yellow("Generated password:", password)
        else:
            username = input("Enter username: ")
            email = input("Enter email: ")
            name = input("Enter name: ")
            password = getpass.getpass("Enter password: ")
            if not password or password == "":
                password = generate_random_password()
                print_yellow("Generated password:", password)

        # The throwaway 'test' account is recreated on every run
        if username == 'test' and username in data["credentials"]["usernames"]:
            delete_user(data, username)

        email = email.lower().strip()
        checked_username = fix_key(username)
        if checked_username != username:
            print_red(f"Username '{username}' contains invalid characters.")
            print_yellow(f"Using '{checked_username}' instead.")
            username = checked_username

        add_user(data, username, email, name, password)
        make_arango(username)
        write_yaml(yaml_file, data)
        print_green(f"User {username} added successfully.")


if __name__ == "__main__":
    main()
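
For reference, a sketch of how a login check could verify the bcrypt hash that add_user() writes to streamlit_users.yaml (assumption: "lasse" is a placeholder username; the YAML structure is as above):

import bcrypt
import yaml

with open("streamlit_users.yaml") as f:
    data = yaml.safe_load(f)

stored_hash = data["credentials"]["usernames"]["lasse"]["password"]  # placeholder user
candidate = "some-password-to-check"
# checkpw re-hashes the candidate with the salt embedded in the stored hash
if bcrypt.checkpw(candidate.encode("utf-8"), stored_hash.encode("utf-8")):
    print("valid")
else:
    print("invalid")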
@@ -1,97 +0,0 @@
import yaml
import sys
import bcrypt
from _arango import ArangoDB
import os
import dotenv
import getpass

dotenv.load_dotenv()


def read_yaml(file_path):
    with open(file_path, "r") as file:
        return yaml.safe_load(file)


def write_yaml(file_path, data):
    with open(file_path, "w") as file:
        yaml.safe_dump(data, file)


def add_user(data, username, email, name, password):
    # Check for existing username
    if username in data["credentials"]["usernames"]:
        print(f"Error: Username '{username}' already exists.")
        sys.exit(1)

    # Check for existing email
    for user in data["credentials"]["usernames"].values():
        if user["email"] == email:
            print(f"Error: Email '{email}' already exists.")
            sys.exit(1)

    # Hash the password using bcrypt
    hashed_password = bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt()).decode(
        "utf-8"
    )

    # Add the new user
    data["credentials"]["usernames"][username] = {
        "email": email,
        "name": name,
        "password": hashed_password,
    }


def make_arango(username):
    root_user = os.getenv("ARANGO_ROOT_USER")
    root_password = os.getenv("ARANGO_ROOT_PASSWORD")
    arango = ArangoDB(user=root_user, password=root_password, db_name="_system")

    if not arango.db.has_database(username):
        arango.db.create_database(
            username,
            users=[
                {
                    "username": os.getenv("ARANGO_USER"),
                    "password": os.getenv("ARANGO_PASSWORD"),
                    "active": True,
                    "extra": {},
                }
            ]
        )
    arango = ArangoDB(user=root_user, password=root_password, db_name=username)
    for collection in ["projects", "favorite_articles", "article_collections", "settings", 'chats', 'notes', 'other_documents']:
        if not arango.db.has_collection(collection):
            arango.db.create_collection(collection)
    user_arango = ArangoDB(db_name=username)
    user_arango.db.collection("settings").insert(
        {"current_page": 'Bot Chat', "current_project": None}
    )


def main():

    yaml_file = "streamlit_users.yaml"
    if len(sys.argv) == 5:
        username = sys.argv[1]
        email = sys.argv[2]
        name = sys.argv[3]
        password = sys.argv[4]
    else:
        username = input("Enter username: ")
        email = input("Enter email: ")
        name = input("Enter name: ")
        password = getpass.getpass("Enter password: ")

    data = read_yaml(yaml_file)
    add_user(data, username, email, name, password)
    make_arango(username)
    write_yaml(yaml_file, data)
    print(f"User {username} added successfully.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,345 @@
import os
import urllib.parse
import streamlit as st
from _base_class import BaseClass
import feedparser
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from utils import fix_key
from colorprinter.print_color import *
from datetime import datetime, timedelta


class RSSFeedsPage(BaseClass):
    def __init__(self, username: str):
        super().__init__(username=username)
        self.page_name = "RSS Feeds"

        # Initialize attributes from session state if available
        for k, v in st.session_state.get(self.page_name, {}).items():
            setattr(self, k, v)

    def run(self):
        if "selected_feed" not in st.session_state:
            st.session_state["selected_feed"] = None
        self.update_current_page(self.page_name)
        self.display_feed()

        self.sidebar_actions()

        # Persist state to session_state
        self.update_session_state(page_name=self.page_name)


    def select_rss_feeds(self):
        # Fetch RSS feeds from the user's ArangoDB collection
        rss_feeds = self.get_rss_feeds()
        if rss_feeds:
            feed_options = [feed["title"] for feed in rss_feeds]
            with st.sidebar:
                st.subheader("Show your feeds")
                selected_feed_title = st.selectbox(
                    "Select a feed", options=feed_options, index=None
                )
                if selected_feed_title:
                    st.session_state["selected_feed"] = [
                        feed["_key"]
                        for feed in rss_feeds
                        if feed["title"] == selected_feed_title
                    ][0]
                    st.rerun()

        else:
            st.write("You have no RSS feeds added.")

    def get_rss_feeds(self):
        return list(self.user_arango.db.collection("rss_feeds").all())

    def sidebar_actions(self):
        with st.sidebar:
            # Select a feed to show
            self.select_rss_feeds()
            st.subheader("Add a New RSS Feed")
            rss_url = st.text_input("Website URL or RSS Feed URL")
            if st.button("Discover Feeds"):
                if rss_url:
                    with st.spinner("Discovering feeds..."):
                        feeds = self.discover_feeds(rss_url)
                        if feeds:
                            st.session_state["discovered_feeds"] = feeds
                            st.rerun()
                        else:
                            st.error("No RSS feeds found at the provided URL.")
            if "discovered_feeds" in st.session_state:
                st.subheader("Select a Feed to Add")
                feeds = st.session_state["discovered_feeds"]
                feed_options = [f"{feed['title']} ({feed['href']})" for feed in feeds]
                selected_feed = st.selectbox("Available Feeds", options=feed_options)
                selected_feed_url = feeds[feed_options.index(selected_feed)]["href"]
                # Parse the feed up front so that both "Preview Feed" and
                # "Add RSS Feed" have feed_data and description available
                feed_data = feedparser.parse(selected_feed_url)
                description = html_to_markdown(
                    feed_data.feed.get("description", "No description")
                )
                if st.button("Preview Feed"):
                    st.write(feed_data.feed.get("title", "No title"))
                    st.write(f"_{description}_")
                    for entry in feed_data.entries[:5]:
                        with st.expander(entry.title):
                            summary = (
                                entry.summary
                                if "summary" in entry
                                else "No summary available"
                            )
                            markdown_summary = html_to_markdown(summary)
                            st.markdown(markdown_summary)
                if st.button(
                    "Add RSS Feed",
                    on_click=self.add_rss_feed,
                    args=(selected_feed_url, feed_data, description),
                ):
                    del st.session_state["discovered_feeds"]
                    st.success("RSS Feed added.")
                    st.rerun()

    def discover_feeds(self, url):
        try:
            if not url.startswith("http"):
                url = "https://" + url

            # Check if the input URL is already an RSS feed
            f = feedparser.parse(url)
            if len(f.entries) > 0:
                return [
                    {
                        "href": url,
                        "title": f.feed.get("title", "No title"),
                        "icon": self.get_site_icon(url),
                    }
                ]

            # If not, proceed to discover feeds from the webpage
            raw = requests.get(url).text
            result = []
            possible_feeds = []
            html = BeautifulSoup(raw, "html.parser")

            # Find the site icon
            icon_url = self.get_site_icon(url, html)

            # Find all <link> tags with rel="alternate" and type containing "rss" or "xml"
            feed_urls = html.find_all("link", rel="alternate")
            for f in feed_urls:
                t = f.get("type", None)
                if t and ("rss" in t or "xml" in t):
                    href = f.get("href", None)
                    if href:
                        possible_feeds.append(urljoin(url, href))

            # Find all <a> tags with href containing "rss", "xml", or "feed"
            parsed_url = urllib.parse.urlparse(url)
            base = parsed_url.scheme + "://" + parsed_url.hostname
            atags = html.find_all("a")
            for a in atags:
                href = a.get("href", None)
                if href and ("rss" in href or "xml" in href or "feed" in href):
                    possible_feeds.append(urljoin(base, href))

            # Validate the possible feeds using feedparser
            for feed_url in list(set(possible_feeds)):
                f = feedparser.parse(feed_url)
                if len(f.entries) > 0:
                    result.append(
                        {
                            "href": feed_url,
                            "title": f.feed.get("title", "No title"),
                            "icon": icon_url,
                        }
                    )

            return result
        except Exception as e:
            print(f"Error discovering feeds: {e}")
            return []

    def add_rss_feed(self, url, feed_data, description):
        try:
            icon_url = feed_data["feed"]["image"]["href"]
        except Exception:
            icon_url = self.get_site_icon(url)

        title = feed_data["feed"].get("title", "No title")
        print_blue(title)
        icon_path = download_icon(icon_url) if icon_url else None
        _key = fix_key(url)
        now_timestamp = datetime.now().isoformat()  # Convert datetime to ISO format string

        self.user_arango.db.collection("rss_feeds").insert(
            {
                "_key": _key,
                "url": url,
                "title": title,
                "icon_path": icon_path,
                "description": description,
                "fetched_timestamp": now_timestamp,  # Add the timestamp field
                "feed_data": feed_data,
            },
            overwrite=True,
        )

        feed = self.get_feed_from_arango(_key)
        if feed:
            self.update_feed(_key, feed)
        else:
            self.base_arango.db.collection("rss_feeds").insert(
                {
                    "_key": _key,
                    "url": url,
                    "title": title,
                    "icon_path": icon_path,
                    "description": description,
                    "fetched_timestamp": now_timestamp,  # Add the timestamp field
                    "feed_data": feed_data,
                },
                overwrite=True,
                overwrite_mode="update",
            )

    def update_feed(self, feed_key, feed=None):
        """
        Updates an RSS feed that already exists in the ArangoDB base database.

        Args:
            feed_key (str): The key identifying the feed in the database.
            feed (dict, optional): The feed document, if already fetched.

        Returns:
            dict: The parsed feed data.

        Raises:
            Exception: If there is an error updating the feed in the database.
        """
        if not feed:
            feed = self.get_feed_from_arango(feed_key)

        feed_data = feedparser.parse(feed["url"])
        print_rainbow(feed_data['feed'])
        feed["feed_data"] = feed_data
        if self.username not in feed.get("users", []):
            feed["users"] = feed.get("users", []) + [self.username]
        fetched_timestamp = datetime.now().isoformat()  # Convert datetime to ISO format string

        # Persist the refreshed data (including the users list) in the database
        self.base_arango.db.collection("rss_feeds").update(
            {
                "_key": feed["_key"],
                "fetched_timestamp": fetched_timestamp,
                "feed_data": feed_data,
                "users": feed.get("users", []),
            }
        )
        return feed_data

    def update_session_state(self, page_name=None):
        # Update session state
        if page_name:
            st.session_state[page_name] = self.__dict__

    def get_site_icon(self, url, html=None):
        try:
            if not html:
                raw = requests.get(url).text
                html = BeautifulSoup(raw, "html.parser")

            icon_link = html.find("link", rel="icon")
            if icon_link:
                icon_url = icon_link.get("href", None)
                if icon_url:
                    return urljoin(url, icon_url)

            # Fallback to finding other common icon links
            icon_link = html.find("link", rel="shortcut icon")
            if icon_link:
                icon_url = icon_link.get("href", None)
                if icon_url:
                    return urljoin(url, icon_url)

            return None
        except Exception as e:
            print(f"Error getting site icon: {e}")
            return None

    def get_feed_from_arango(self, feed_key):
        """
        Retrieve an RSS feed from the ArangoDB base database.

        Args:
            feed_key (str): The key of the RSS feed to retrieve from the ArangoDB base database.

        Returns:
            dict: The RSS feed document retrieved from the ArangoDB base database.
        """
        return self.base_arango.db.collection("rss_feeds").get(feed_key)

    def get_feed(self, feed_key):
        feed = self.get_feed_from_arango(feed_key)
        feed_data = feed["feed_data"]
        fetched_time = datetime.fromisoformat(feed['fetched_timestamp'])  # Parse the timestamp string

        if datetime.now() - fetched_time < timedelta(hours=1):
            return feed_data
        else:
            return self.update_feed(feed_key)

    def display_feed(self):
        if st.session_state["selected_feed"]:
            feed_data = self.get_feed(st.session_state["selected_feed"])

            st.title(feed_data['feed'].get("title", "No title"))
            st.write(feed_data['feed'].get("description", "No description"))
            st.write("**Recent Entries:**")
            for entry in feed_data['entries'][:5]:
                with st.expander(entry['title']):
                    summary = (
                        entry['summary'] if "summary" in entry else "No summary available"
                    )
                    markdown_summary = html_to_markdown(summary)
                    st.markdown(markdown_summary)
                    st.markdown(f"[Read more]({entry['link']})")

def html_to_markdown(html):
    soup = BeautifulSoup(html, "html.parser")
    for br in soup.find_all("br"):
        br.replace_with("\n")
    for strong in soup.find_all("strong"):
        strong.replace_with(f"**{strong.text}**")
    for em in soup.find_all("em"):
        em.replace_with(f"*{em.text}*")
    for p in soup.find_all("p"):
        p.replace_with(f"{p.text}\n\n")
    return soup.get_text()


def download_icon(icon_url, save_folder="external_icons"):
    try:
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)

        response = requests.get(icon_url, stream=True)
        if response.status_code == 200:
            icon_name = os.path.basename(icon_url)
            icon_path = os.path.join(save_folder, icon_name)
            with open(icon_path, "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            return icon_path
        else:
            print(f"Failed to download icon: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error downloading icon: {e}")
        return None
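
A sketch of how this page might be mounted in the app (assumptions: the module name rss_feeds_page and the "username" session-state key are hypothetical; authentication is handled elsewhere):

import streamlit as st
from rss_feeds_page import RSSFeedsPage  # module name assumed

# The page persists its own attributes under st.session_state["RSS Feeds"],
# so it can simply be re-instantiated and run on every Streamlit rerun
username = st.session_state.get("username", "lasse")  # placeholder
RSSFeedsPage(username).run()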
@@ -0,0 +1,91 @@
import asyncio
import re

from pdf_highlighter import Highlighter
from _chromadb import ChromaDB
from _llm import LLM
from colorprinter.print_color import *
from concurrent.futures import ThreadPoolExecutor


# Wrap the synchronous generate method so it can be awaited
async def async_generate(llm, prompt):
    loop = asyncio.get_event_loop()
    with ThreadPoolExecutor() as pool:
        return await loop.run_in_executor(pool, llm.generate, prompt)


# Define the main asynchronous function to highlight the PDFs
async def highlight_pdf(data):
    # Use the highlight method to highlight the relevant sentences in the PDFs
    highlighted_pdf_buffer = await highlighter.highlight(
        data=data, zero_indexed_pages=True  # Pages are zero-based (e.g., 0, 1, 2, ...)
    )

    # Save the highlighted PDF to a new file
    with open("highlighted_combined_documents.pdf", "wb") as f:
        f.write(highlighted_pdf_buffer.getbuffer())
    print_green("PDF highlighting completed successfully!")


# Initialize ChromaDB client
chromadb = ChromaDB()

# Define the query to fetch relevant text snippets and metadata from ChromaDB
query = "How are climate researchers advocating for change in the society?"


# Perform the query on ChromaDB
result = chromadb.query(query, collection="sci_articles", n_results=5)
# Use zip to combine the lists into a list of dictionaries
results = [
    {"id": id_, "metadata": metadata, "document": document, "distance": distance}
    for id_, metadata, document, distance in zip(
        result["ids"][0],
        result["metadatas"][0],
        result["documents"][0],
        result["distances"][0],
    )
]

for r in results:
    print_rainbow(r["metadata"])
    print_yellow(type(r["metadata"]["pages"]))

# Ask an LLM a question about the text snippets
llm = LLM(model="small")
documents_string = "\n\n---\n\n".join(result["documents"][0])
answer = llm.generate(
    f'''{query} Answer using the information below.\n\n"""{documents_string}"""\n\n{query}'''
)
print_green(answer)
# Now highlight the relevant information in the PDFs to see what the LLM is drawing on

# Each result from ChromaDB contains the PDF filename and the pages where the text is found
data = []
for result in results:
    pages = result["metadata"].get("pages")
    try:
        pages = [int(pages)]
    except (TypeError, ValueError):
        # Use re to extract the page numbers separated by commas
        pages = list(map(int, re.findall(r"\d+", pages)))

    data.append(
        {
            "user_input": query,
            "pdf_filename": result["metadata"]["_id"],
            "pages": pages,
            "chunk": result["document"],
        }
    )

# Initialize the Highlighter
highlighter = Highlighter(
    llm=llm,  # Pass the LLM to the Highlighter (only used when use_llm=True)
    comment=False,  # Set to True to attach explanatory comments to highlights
    use_llm=False,
)


# Run the main function using asyncio
asyncio.run(highlight_pdf(data))
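
A variant worth noting (an assumption based only on the parameters shown above, since pdf_highlighter's API is not documented here): enabling the LLM-assisted mode would let the model pick the sentences to highlight and annotate them.

# Hypothetical variant of the Highlighter configuration above
highlighter = Highlighter(
    llm=llm,
    comment=True,   # attach a short LLM-written comment to each highlight
    use_llm=True,   # let the LLM select the relevant sentences
)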
@@ -0,0 +1,32 @@
import os
import base64
from ollama import Client
import env_manager
from colorprinter.print_color import *
env_manager.set_env()

# Encode the credentials
credentials = f"{os.getenv('LLM_API_USER')}:{os.getenv('LLM_API_PWD_LASSE')}"
encoded_credentials = base64.b64encode(credentials.encode()).decode()

# Set up the headers with authentication details
headers = {
    'Authorization': f'Basic {encoded_credentials}'
}

# Get the host URL (base URL only); note that rstrip('/api/chat/') would strip
# a character set rather than the suffix, so use removesuffix instead
host_url = os.getenv("LLM_API_URL").removesuffix('/api/chat/')


# Initialize the client with the host and headers
client = Client(
    host=host_url,
    headers=headers
)

# Example usage of the client
try:
    response = client.chat(model=os.getenv('LLM_MODEL'), messages=[{'role': 'user', 'content': 'Why is the sky blue?'}])
    print_rainbow(response)
except Exception as e:
    print(f"Error: {e}")