Add initial implementation of RSS analyzer and ArangoDB integration

main
lasseedfast 1 year ago
parent 01df43bba2
commit 00fd42b32d
  1. 21
      .env
  2. 1
      __init__.py
  3. 3
      _base_class.py
  4. 26
      _chromadb.py
  5. 119
      _classes.py
  6. 181
      _llm.py
  7. 260
      _rss.py
  8. 6
      arango_admin.py
  9. 37
      article2db.py
  10. 192
      manage_users.py
  11. 97
      new_user.py
  12. 33
      prompts.py
  13. 0
      rss_analyzer.py
  14. 63
      streamlit_app.py
  15. 200
      streamlit_chatbot.py
  16. 16
      streamlit_pages.py
  17. 345
      streamlit_rss_old.py
  18. 91
      test_highlight.py
  19. 32
      test_ollama_client.py

21
.env

@ -0,0 +1,21 @@
# Chroma
CHROMA_CLIENT_AUTH_CREDENTIALS="overpass-alms-porker-file-seigneur-kiln"
CHROMA_SERVER_AUTHN_PROVIDER="chromadb.auth.basic_authn.BasicAuthenticationServerProvider"
CHROMA_AUTH_TOKEN_TRANSPORT_HEADER="X-Chroma-Token"
CHROMA_HOST="http://192.168.1.10:8007"
CHROMA_CLIENT_AUTH_CREDENTIALS="overpass-alms-porker-file-seigneur-kiln"
CHROMA_SERVER_AUTHN_PROVIDER="chromadb.auth.basic_authn.BasicAuthenticationServerProvider"
CHROMA_AUTH_TOKEN_TRANSPORT_HEADER="X-Chroma-Token"
_CHROMA_HOST="https://lasseedfast.se/chroma_ev_cars/"
# Arango
ARANGO_HOST="http://192.168.1.10:8531"
ARANGO_USER="admin"
ARANGO_PASSWORD="raHzaw-5vyjqo-xisfec"
ARANGO_DB="base"
ARANGO_PWD_ENV_MANAGER="jagskoterenv(Y)"
ARANGO_ROOT_USER='root'
ARANGO_ROOT_PASSWORD='gyhqed-kiwNac-9buhme'
MAILERSEND_API_KEY="mlsn.71de3eb2dbcb733bd4ee509d1c95ccfc8939fd647cba9e3a0f631f60f900bd85"

@ -0,0 +1 @@
from pdf_highlighter import Highlighter

@ -13,6 +13,7 @@ class BaseClass:
self.project_name: str = kwargs.get('project_name', None)
self.collection: str = kwargs.get('collection_name', None)
self.user_arango: ArangoDB = self.get_arango()
self.base_arango: ArangoDB = self.get_arango(admin=True)
def get_arango(self, admin: bool = False, db_name: str = None) -> ArangoDB:
@ -128,7 +129,7 @@ class BaseClass:
self.file_path = f"sci_articles/{self.doi}.pdf".replace("/", "_")
return os.path.exists(self.file_path)
else:
file_path = f"{self.download_folder}/{filename}"
file_path = f"{download_folder}/{filename}"
while os.path.exists(file_path + ".pdf"):
if not re.search(r"(_\d+)$", file_path):
file_path += "_1"

@ -3,7 +3,6 @@ import os
from chromadb.config import Settings
from dotenv import load_dotenv
from colorprinter.print_color import *
load_dotenv(".env")
@ -96,12 +95,14 @@ class ChromaDB:
if k not in r["included"]:
continue
result[k][0] = v[0][: n_results - (n_sources - len(sources))]
if "_id" in where:
if where and "_id" in where:
where["_id"]["$in"] = [
i for i in where["_id"]["$in"] if i not in sources
]
if where["_id"]["$in"] == []:
break
else:
break
return result
@ -109,7 +110,8 @@ if __name__ == "__main__":
from colorprinter.print_color import *
chroma = ChromaDB()
print(chroma.db.list_collections())
exit()
result = chroma.query(
query="What is Open Science)",
collection="sci_articles",
@ -117,19 +119,5 @@ if __name__ == "__main__":
n_sources=3,
max_retries=4,
)
print(result)
exit()
all = chroma_collection.get()
ids = all.get("ids", [])
metadatas = all.get("metadatas", [])
combined_list = list(zip(ids, metadatas))
ids = []
metadatas = []
for id, metadata in combined_list:
ids.append(id)
metadata["_id"] = f"sci_articles/{metadata['_key']}"
metadatas.append(metadata)
chroma_collection.update(ids=ids, metadatas=metadatas)
print_rainbow(result['metadatas'][0])

@ -1,22 +1,28 @@
# streamlit_pages.py
import os
import feedparser
import urllib
from urllib.parse import urljoin
import requests
import re
from bs4 import BeautifulSoup
import streamlit as st
from time import sleep
import pandas as pd
from datetime import datetime, timedelta
from PIL import Image
from io import BytesIO
import base64
from colorprinter.print_color import *
from article2db import PDFProcessor
from streamlit_chatbot import Chat, EditorBot, ResearchAssistantBot, PodBot
import feedparser
from streamlit_chatbot import Chat, EditorBot, ResearchAssistantBot, PodBot, Bot
from info import country_emojis
from utils import fix_key
from _arango import ArangoDB
from _llm import LLM
from _base_class import BaseClass
from _rss import RSSReader
from prompts import get_note_summary_prompt, get_image_system_prompt
@ -399,6 +405,12 @@ class BotChatPage(BaseClass):
"chat": self.chat,
"role": self.role,
}
else:
bot = Bot(
username=self.username,
chat=Chat(username=self.username, role="Research Assistant"),
)
bot.run()
def sidebar_actions(self):
with st.sidebar:
@ -680,7 +692,6 @@ class Project(BaseClass):
def load_project(self):
print_blue("Project name:", self.name)
print(self.user_arango, type(self.user_arango))
project_cursor = self.user_arango.db.aql.execute(
"FOR doc IN projects FILTER doc.name == @name RETURN doc",
bind_vars={"name": self.name},
@ -927,3 +938,103 @@ class SettingsPage(BaseClass):
self.update_settings("avatar", img_path)
st.success("Profile picture uploaded")
sleep(1)
class RSSFeedsPage(BaseClass):
    """Streamlit page for browsing stored RSS feeds and adding new ones.

    The main area shows the feed whose key is stored in
    ``st.session_state["selected_feed"]``; the sidebar holds the feed picker
    and the "discover / preview / add" workflow.
    """

    def __init__(self, username: str):
        """Create the page and its ``RSSReader`` for *username*."""
        super().__init__(username=username)
        self.page_name = "RSS Feeds"
        self.reader = RSSReader(username=username)
        # Initialize attributes from session state if available
        for k, v in st.session_state.get(self.page_name, {}).items():
            setattr(self, k, v)

    def run(self):
        """Render the page: main feed view first, then the sidebar controls."""
        if "selected_feed" not in st.session_state:
            st.session_state["selected_feed"] = None
        self.update_current_page(self.page_name)
        self.display_feed()
        self.sidebar_actions()
        self.update_session_state(page_name=self.page_name)

    def select_rss_feeds(self):
        """Show a feed picker and store the chosen feed key in session state."""
        rss_feeds = self.reader.get_rss_feeds()
        if not rss_feeds:
            st.write("You have no RSS feeds added.")
            return

        feed_options = [feed["title"] for feed in rss_feeds]
        with st.sidebar:
            st.subheader("Show your feeds")
            selected_feed_title = st.selectbox(
                "Select a feed", options=feed_options, index=None
            )
        if not selected_feed_title:
            return

        selected_key = [
            feed["_key"]
            for feed in rss_feeds
            if feed["title"] == selected_feed_title
        ][0]
        # display_feed() has already run for this script pass, so a rerun is
        # needed to show a newly chosen feed. Rerun ONLY when the selection
        # changed: the selectbox keeps returning the same title on every pass,
        # and an unconditional rerun would loop forever.
        if st.session_state.get("selected_feed") != selected_key:
            st.session_state["selected_feed"] = selected_key
            st.rerun()

    def search_feeds(self, rss_url):
        """Discover feeds at *rss_url* and store them, or report that none exist."""
        with st.spinner("Discovering feeds..."):
            feeds = self.reader.discover_feeds(rss_url)
        if feeds:
            st.session_state["discovered_feeds"] = feeds
        else:
            st.error("No RSS feeds found at the provided URL.")

    def sidebar_actions(self):
        """Render the sidebar: feed picker plus discover/preview/add controls."""
        if 'discovered_feeds' not in st.session_state:
            st.session_state['discovered_feeds'] = None

        with st.sidebar:
            self.select_rss_feeds()

            st.subheader("Add a New RSS Feed")
            with st.form("add_rss_feed"):
                rss_url = st.text_input("Website URL or RSS Feed URL")
                submitted = st.form_submit_button("Discover Feeds")
            if submitted:
                print_green(rss_url)
                feeds = self.reader.discover_feeds(rss_url)
                st.session_state['discovered_feeds'] = feeds

            if st.session_state["discovered_feeds"]:
                st.subheader("Select a Feed to Add")
                feeds = st.session_state["discovered_feeds"]
                feed_options = [f"{feed['title']} ({feed['href']})" for feed in feeds]
                selected_feed = st.selectbox("Available Feeds", options=feed_options)
                selected_feed_url = feeds[feed_options.index(selected_feed)]["href"]

                if st.button("Preview Feed"):
                    feed = self.reader.parse_feed(selected_feed_url)
                    st.write(f"{feed.title}")
                    description = self.reader.html_to_markdown(feed.description)
                    st.write(f"_{description}_")
                    for entry in feed.entries[:5]:
                        with st.expander(entry["title"]):
                            summary = entry.get("summary", "No summary available")
                            markdown_summary = self.reader.html_to_markdown(summary)
                            st.markdown(markdown_summary)

                print_yellow(selected_feed_url)
                # The on_click callback stores the feed before this script
                # pass starts; the branch below only cleans up the UI state.
                if st.button(
                    "Add RSS Feed",
                    on_click=self.reader.add_rss_feed,
                    args=[selected_feed_url],
                ):
                    # Reset rather than delete so later reads in this pass
                    # cannot hit a missing key.
                    st.session_state["discovered_feeds"] = None
                    st.success("RSS Feed added.")
                    st.rerun()

    def display_feed(self):
        """Show title, description and the first five entries of the selected feed."""
        selected_key = st.session_state["selected_feed"]
        if not selected_key:
            return

        self.reader.get_feed(selected_key)
        feed = self.reader.feed
        if feed is None:
            # get_feed leaves reader.feed unset when no stored document matches.
            st.warning("The selected feed could not be loaded.")
            return

        st.title(feed.title)
        st.write(f"_{feed.description}_")
        for entry in feed.entries[:5]:
            with st.expander(entry["title"]):
                summary = entry.get("summary", "No summary available")
                markdown_summary = self.reader.html_to_markdown(summary)
                st.markdown(markdown_summary)
                st.markdown(f"[Read more]({entry['link']})")

@ -1,3 +1,4 @@
import re
import os
from typing import Literal, Optional
import requests
@ -5,20 +6,18 @@ from requests.auth import HTTPBasicAuth
import tiktoken
import json
from colorprinter.print_color import *
import env_manager
import re
import asyncio
import env_manager
env_manager.set_env()
tokenizer = tiktoken.get_encoding("cl100k_base")
print(os.getenv("LLM_API_USER"), os.getenv("LLM_API_PWD_LASSE"))
class LLM:
def __init__(
self,
system_message="You are an assistant.",
num_ctx=8192,
temperature=0.01,
model: Optional[Literal["small", "standard", "vision"]] = "standard",
max_length_answer=4096,
@ -31,7 +30,6 @@ class LLM:
Args:
system_message (str): The initial system message for the assistant. Defaults to "You are an assistant.".
num_ctx (int): The number of context tokens to use. Defaults to 4096.
temperature (float): The temperature setting for the model's response generation. Defaults to 0.01.
chat (bool): Flag to indicate if the assistant is in chat mode. Defaults to True.
model (str): The model type to use. Defaults to "standard". Alternatives: 'small', 'standard', 'vision'.
@ -43,7 +41,7 @@ class LLM:
"""
self.model = self.get_model(model)
self.system_message = system_message
self.options = {"temperature": temperature, "num_ctx": num_ctx}
self.options = {"temperature": temperature}
self.messages = messages or [{"role": "system", "content": self.system_message}]
self.max_length_answer = max_length_answer
self.chat = chat
@ -68,73 +66,117 @@ class LLM:
tokens = tokenizer.encode(v)
num_tokens += len(tokens)
return int(num_tokens)
def read_stream(self, response):
"""
Reads a stream of data from the given response object and yields the content of each message.
Args:
response (requests.Response): The response object to read the stream from.
Yields:
str: The content of each message in the stream.
Notes:
- The response is expected to provide data in chunks, which are decoded as UTF-8.
- Lines are split by newline characters.
- Each line is expected to be a JSON object containing a "message" key with a "content" field.
- If a chunk cannot be decoded as UTF-8, it is skipped.
- If a line cannot be parsed as JSON, it is skipped.
"""
buffer = ""
message = ""
first_chunk = True
prev_content = None # Store the previous content chunk
for chunk in response.iter_content(chunk_size=64):
if chunk:
try:
message_part = chunk.decode("utf-8")
buffer += message_part
message += message_part
except UnicodeDecodeError:
continue
while "\n" in buffer:
line, buffer = buffer.split("\n", 1)
if line:
if line.strip():
try:
json_data = json.loads(line)
yield json_data["message"]["content"]
content = json_data["message"]["content"]
done = json_data.get("done", False)
# Remove leading '"' from the first content
if first_chunk and content.startswith('"'):
content = content[1:]
first_chunk = False
if done:
# If the last content ends with '"', remove it
if prev_content and prev_content.endswith('"'):
prev_content = prev_content[:-1]
# Yield the last content
if prev_content:
yield prev_content
break
else:
# Yield the previous content before storing the current
if prev_content:
yield prev_content
prev_content = content
except json.JSONDecodeError:
continue
# Append the full message without leading/trailing quotes
self.messages.append({"role": "assistant", "content": message.strip('"')})
def make_summary(self, text):
    """Return a concise English summary of *text* produced by the small model.

    Sends a single non-streaming chat request to the endpoint in the
    ``LLM_API_URL`` environment variable, authenticating with
    ``LLM_API_USER`` / ``LLM_API_PWD_LASSE``. The request is independent of
    ``self.messages``; the chat history is neither read nor modified.

    Args:
        text (str): The text to summarise.

    Returns:
        str: The ``message.content`` field of the JSON response.
    """
    data = {
        "messages": [
            {
                "role": "system",
                "content": """You are summarizing a text. Make it detailed and concise. Answer ONLY with the summary. Don't add any new information.""",
            },
            {
                "role": "user",
                "content": f'Summarise the text below:\n"""{text}"""\nRemember to be concise and detailed. Answer in English.',
            },
        ],
        "stream": False,
        "keep_alive": 3600 * 24 * 7,
        "model": self.get_model("small"),
        "options": {"temperature": 0.01},
    }
    response = requests.post(
        os.getenv("LLM_API_URL"),
        json=data,
        auth=HTTPBasicAuth(
            os.getenv("LLM_API_USER"), os.getenv("LLM_API_PWD_LASSE")
        ),
        # Same upper bound as the request in generate(); without it a stalled
        # backend would block the caller forever.
        timeout=3600,
    )
    # Decode the body once and reuse it for both the log line and the result.
    summary = response.json()["message"]["content"]
    print_blue("Summary:", summary)
    return summary
def generate(
self,
query,
stream=False,
tools=None,
function_call=None,
query: str = None,
user_input: str = None,
context: str = None,
stream: bool = False,
tools: list = None,
function_call: dict = None,
images: list = None,
model: Optional[Literal["small", "standard", "vision"]] = None,
temperature=None,
temperature: float = None,
):
"""
Generates a response from the language model based on the provided query and options.
Generates a response from the language model based on the provided inputs.
If user_input is provided, it is included in the message history instead of the query.
If context is provided, it is summaried if len() > 2000 and included in the message history.
Args:
query (str): The input query to be processed by the language model.
query (str, optional): The main query string to be processed by the model.
user_input (str, optional): User input to be included in the message history.
context (str, optional): Contextual information to be included in the message history.
stream (bool, optional): Whether to stream the response. Defaults to False.
tools (list, optional): A list of tools to be used by the language model. Defaults to None.
function_call (dict, optional): A dictionary specifying a function call to be made by the language model. Defaults to None.
images (list, optional): A list of image paths or base64-encoded images to be included in the request. Defaults to None.
model (str, optional): The model alias to be used for generating the response. Defaults to None. Alternatives: 'small', 'standard', 'vision'.
tools (list, optional): List of tools to be included in the request.
function_call (dict, optional): Dictionary specifying a function call to be made.
images (list, optional): List of image paths or base64-encoded images to be included.
model (Optional[Literal["small", "standard", "vision"]], optional): The model type to be used. Defaults to None.
temperature (float, optional): The temperature setting for the model. Defaults to None.
Returns:
str: The generated response from the language model. If streaming is enabled, returns the streamed response.
"""
# Add custom header if large model is chosen
model = self.get_model(model) if model else self.model
temperature = temperature if temperature else self.options["temperature"]
# Normalize whitespace and add the query to the messages
query = re.sub(r"\s*\n\s*", "\n", query)
message = {"role": "user", "content": query}
headers = {"Content-Type": "application/json"}
@ -158,6 +200,11 @@ class LLM:
base64_images.append(
base64.b64encode(image_file.read()).decode("utf-8")
)
elif isinstance(image, bytes):
base64_images.append(base64.b64encode(image).decode("utf-8"))
else:
print_red("Invalid image type")
message["images"] = base64_images
# Set the Content-Type header based on the presence of images
headers = {"Content-Type": "application/json; images"}
@ -165,18 +212,15 @@ class LLM:
# Set the model type to the vision model
if self.chosen_backend:
headers["X-Chosen-Backend"] = self.chosen_backend
self.messages.append(message)
# Set the number of tokens to be the sum of the tokens in the messages and half of the max length of the answer
if self.chat or len(self.messages) > 15000:
num_tokens = self.count_tokens() + self.max_length_answer / 2
if num_tokens < 8000 and "num_ctx" in self.options:
del self.options["num_ctx"]
else:
if num_tokens > 8000:
model = self.get_model("large")
headers["X-Model-Type"] = "standard_64k"
headers["X-Model-Type"] = "large"
if tools:
stream = False
@ -197,7 +241,9 @@ class LLM:
if function_call:
data["function_call"] = function_call
if data['model'] == 'small':
headers["X-Model-Type"] = "small"
response = requests.post(
os.getenv("LLM_API_URL"),
headers=headers,
@ -209,7 +255,21 @@ class LLM:
timeout=3600,
)
self.chosen_backend = response.headers.get('X-Chosen-Backend')
# If user_input is provided, change the last message to user_input and a summary of the context (if provided)
# This needs to be done after the request to LLM for the LLM to have the original message
if user_input:
if context:
if len(context) > 2000:
context = self.make_summary(context)
user_input = f'''{user_input}\n\nUse the information below to answer the question.\n"""{context}"""\n[This is a summary of the context provided in the original message.]'''
system_message_info = "\nSometimes some of the messages in the chat history are summarised, then that is clearly indicated in the message."
if system_message_info not in self.messages[0]["content"]:
self.messages[0]["content"] = (
self.messages[0]["content"] + system_message_info
)
self.messages[-1] = {"role": "user", "content": user_input}
self.chosen_backend = response.headers.get("X-Chosen-Backend")
if response.status_code != 200:
print_red("Error!")
@ -233,7 +293,9 @@ class LLM:
result = response_json["message"]
else:
result = response_json["message"]["content"].strip('"')
self.messages.append({"role": "assistant", "content": result.strip('"')})
self.messages.append(
{"role": "assistant", "content": result.strip('"')}
)
except requests.exceptions.JSONDecodeError:
print_red("Error: ", response.status_code, response.text)
return "An error occurred."
@ -242,6 +304,33 @@ class LLM:
self.messages = [self.messages[0]]
return result
async def async_generate(
    self,
    query: str = None,
    user_input: str = None,
    context: str = None,
    stream: bool = False,
    tools: list = None,
    function_call: dict = None,
    images: list = None,
    model: Optional[Literal["small", "standard", "vision"]] = None,
    temperature: float = None,
):
    """Run the blocking ``generate`` in the default executor and await its result.

    Accepts the same arguments as ``generate`` and returns whatever
    ``generate`` returns, without blocking the running event loop.
    """
    import functools

    # Forward by keyword so a future reordering of generate()'s parameters
    # cannot silently shift arguments into the wrong slot.
    call = functools.partial(
        self.generate,
        query=query,
        user_input=user_input,
        context=context,
        stream=stream,
        tools=tools,
        function_call=function_call,
        images=images,
        model=model,
        temperature=temperature,
    )
    # Inside a coroutine the running loop is the correct one to use;
    # get_event_loop() is deprecated for this purpose.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, call)
if __name__ == "__main__":
llm = LLM()

@ -0,0 +1,260 @@
# _rss.py
import feedparser
import requests
import urllib
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime, timedelta
from utils import fix_key
import os
from _base_class import BaseClass
from _llm import LLM
from colorprinter.print_color import *
class RSSFeed:
    """Plain attribute container describing one RSS feed and its entries."""

    def __init__(self):
        """Start with every scalar field unset and an empty entry list."""
        # Attribute order is kept stable because the instance __dict__ is
        # stored as a database document elsewhere in this module.
        for field_name in (
            "url",
            "title",
            "icon_path",
            "description",
            "feed_data",
            "fetched_timestamp",
        ):
            setattr(self, field_name, None)
        # A fresh list per instance; never shared between feeds.
        self.entries = list()
class RSSReader(BaseClass):
    """Discover, fetch, parse and persist RSS feeds on behalf of one user.

    Feed documents are read from and written to the ``rss_feeds`` collection
    of ``self.base_arango``. ``add_rss_feed`` additionally records a
    subscription in the ``user_feeds`` collection of ``self.user_arango``.
    The most recently loaded feed is kept in ``self.feed``.
    """

    # Upper bound (seconds) for every HTTP request issued through `requests`,
    # so an unresponsive host cannot hang the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, username):
        """Set up database handles for *username*; no feed is loaded yet."""
        super().__init__(username=username)
        self.username = username
        # NOTE(review): BaseClass.get_arango is declared as
        # get_arango(admin=False, db_name=None), so `username` binds to the
        # `admin` parameter here. Left unchanged because the rest of the
        # class may rely on the resulting handle; confirm whether
        # `db_name=username` was intended.
        self.user_arango = self.get_arango(username)
        self.feed: RSSFeed = None
        self.arango_feed = None

    def discover_feeds(self, url):
        """Return the feeds reachable from *url*.

        If *url* itself parses as a feed with entries, it is returned as the
        only result. Otherwise the page is downloaded and candidate links
        (``<link rel="alternate">`` with an rss/xml type, and ``<a>`` tags
        whose href mentions rss/xml/feed) are validated with feedparser.

        Args:
            url (str): Website or feed URL; ``https://`` is prepended when the
                value does not start with ``http``.

        Returns:
            list[dict]: One dict per feed with keys ``href``, ``title`` and
            ``icon``. Empty on failure (the error is printed, not raised).
        """
        try:
            if not url.startswith("http"):
                url = "https://" + url

            # Check if the input URL is already an RSS feed
            direct = feedparser.parse(url)
            if len(direct.entries) > 0:
                return [
                    {
                        "href": url,
                        "title": direct.feed.get("title", "No title"),
                        "icon": self.get_site_icon(url),
                    }
                ]

            # If not, proceed to discover feeds from the webpage
            raw = requests.get(url, timeout=self.REQUEST_TIMEOUT).text
            html = BeautifulSoup(raw, "html.parser")
            icon_url = self.get_site_icon(url, html)

            possible_feeds = []
            # <link rel="alternate"> tags whose type mentions rss or xml
            for link in html.find_all("link", rel="alternate"):
                mime = link.get("type", None)
                href = link.get("href", None)
                if mime and ("rss" in mime or "xml" in mime) and href:
                    possible_feeds.append(urljoin(url, href))

            # <a> tags whose href mentions rss, xml or feed. Resolved against
            # the page URL (like the <link> tags above) so relative links,
            # non-default ports and pages without a hostname are handled.
            for anchor in html.find_all("a"):
                href = anchor.get("href", None)
                if href and ("rss" in href or "xml" in href or "feed" in href):
                    possible_feeds.append(urljoin(url, href))

            # Validate the candidates; dict.fromkeys de-duplicates while
            # keeping document order, so results are deterministic.
            result = []
            for feed_url in dict.fromkeys(possible_feeds):
                parsed = feedparser.parse(feed_url)
                if len(parsed.entries) > 0:
                    result.append(
                        {
                            "href": feed_url,
                            "title": parsed.feed.get("title", "No title"),
                            "icon": icon_url,
                        }
                    )
            return result
        except Exception as e:
            # Deliberately best-effort: discovery problems are reported and
            # turned into "no feeds found".
            print(f"Error discovering feeds: {e}")
            return []

    def add_rss_feed(self, url):
        """Fetch the feed at *url*, store it, and subscribe the user to it.

        The feed document is keyed by ``fix_key(url)``. Both inserts use
        ``overwrite=True`` so adding a feed that is already stored refreshes
        it instead of failing on the duplicate key.
        """
        # The previous version first called get_feed(url), which passed the
        # URL as a document key; that lookup could not match a stored feed.
        self.load_feed_from_url(url=url)
        self.feed._key = fix_key(self.feed.url)

        # Store feed data in base_arango's rss_feeds collection
        self.base_arango.db.collection("rss_feeds").insert(
            self.feed.__dict__, overwrite=True
        )

        # Store a reference to the feed in user_arango's user_feeds collection
        # NOTE(review): get_rss_feeds() lists "rss_feeds", not "user_feeds" —
        # confirm which collection is meant to hold a user's subscriptions.
        self.user_arango.db.collection("user_feeds").insert(
            {
                "_key": self.feed._key,  # Use the same key to reference the feed
                "feed_key": self.feed._key,
                "subscribed_on": datetime.now().isoformat(),
                # Add additional user-specific fields here
            },
            overwrite=True,
        )

    def load_feed_from_url(self, url=None, data=None):
        """Populate ``self.feed`` from a URL, from pre-parsed data, or refresh it.

        Args:
            url (str, optional): Feed URL to download and parse into a new
                ``RSSFeed``.
            data (dict, optional): Already parsed feed data with ``feed`` and
                ``entries`` keys (and optionally ``url``), used instead of
                downloading.

        With neither argument, the feed already held in ``self.feed`` is
        re-downloaded from its stored URL.

        Raises:
            ValueError: If neither argument is given and no feed is loaded.
        """
        if url:
            self.feed = RSSFeed()
            self.feed.url = url
            full_feed_data = feedparser.parse(url)
        elif data:
            self.feed = RSSFeed()
            self.feed.url = data.get("url", None)
            full_feed_data = data
        else:
            if self.feed is None:
                raise ValueError(
                    "No feed loaded: pass url or data, or load a feed first."
                )
            full_feed_data = feedparser.parse(self.feed.url)

        self.feed.title = full_feed_data["feed"].get("title", "No title")
        self.feed.description = full_feed_data["feed"].get(
            "description", "No description"
        )
        self.feed.icon_path = self.get_site_icon(self.feed.url)
        self.feed.entries = [
            {
                "title": entry.get("title", "No title"),
                "link": entry.get("link"),
                "published": entry.get("published"),
                "summary": self.html_to_markdown(
                    entry.get("summary", "No summary")
                ),
                "id": entry.get("id"),
                "author": entry.get("author"),
            }
            for entry in full_feed_data["entries"]
        ]
        self.feed.fetched_timestamp = datetime.now().isoformat()

    def feed_data2feed(self, data):
        """Populate ``self.feed`` from already parsed feed *data*."""
        self.load_feed_from_url(data=data)

    def parse_feed(self, url):
        """Download and parse the feed at *url*; return the ``RSSFeed``."""
        self.load_feed_from_url(url=url)
        return self.feed

    def update_feed(self):
        """Re-download the current feed, persist it, and return its entries."""
        self.load_feed_from_url()
        # Write to the same collection get_feed() reads from; otherwise the
        # stored timestamp never advances and every read refetches the feed.
        self.base_arango.db.collection("rss_feeds").update(
            {
                "_key": self.feed._key,
                "fetched_timestamp": self.feed.fetched_timestamp,
                "entries": self.feed.entries,
            }
        )
        return self.feed.entries

    def _find_feed_by(self, field, value):
        """Return the first stored feed whose *field* equals *value*, else None."""
        cursor = self.base_arango.db.aql.execute(
            "FOR doc IN rss_feeds FILTER doc[@field] == @value LIMIT 1 RETURN doc",
            # Bind variables keep caller-supplied text out of the query string.
            bind_vars={"field": field, "value": value},
        )
        for doc in cursor:
            return doc
        return None

    def get_feed(self, feed_key=None, url=None, _id=None):
        """Load a stored feed into ``self.feed`` and return its entries.

        The first truthy selector wins: *feed_key* (document key), then *url*,
        then *_id*. Entries fetched less than one hour ago are returned as
        stored; older ones are refreshed through ``update_feed``.

        Returns:
            list | None: The feed entries, or ``None`` when no selector is
            given or no stored feed matches (``self.feed`` is left untouched).
        """
        arango_doc = None
        if feed_key:
            arango_doc = self.base_arango.db.collection("rss_feeds").get(feed_key)
        elif url:
            arango_doc = self._find_feed_by("url", url)
        elif _id:
            # NOTE(review): this filters on the attribute `id`, as before;
            # confirm whether the document handle `_id` was meant.
            arango_doc = self._find_feed_by("id", _id)

        if not arango_doc:
            return None

        self.feed = RSSFeed()
        for attr in arango_doc:
            setattr(self.feed, attr, arango_doc[attr])

        # A document without a timestamp is treated as stale and refreshed.
        stamp = self.feed.fetched_timestamp
        if stamp:
            age = datetime.now() - datetime.fromisoformat(stamp)
            if age < timedelta(hours=1):
                return self.feed.entries
        return self.update_feed()

    def get_site_icon(self, url, html=None):
        """Return the absolute URL of the site's icon, or ``None``.

        Args:
            url (str): Page URL; also the base for resolving a relative href.
            html (BeautifulSoup, optional): Already parsed page. When omitted
                the page is downloaded.
        """
        try:
            if not html:
                raw = requests.get(url, timeout=self.REQUEST_TIMEOUT).text
                html = BeautifulSoup(raw, "html.parser")

            # Preferred rel value first, then the common legacy fallback.
            for rel_value in ("icon", "shortcut icon"):
                icon_link = html.find("link", rel=rel_value)
                if icon_link:
                    icon_url = icon_link.get("href", None)
                    if icon_url:
                        return urljoin(url, icon_url)
            return None
        except Exception as e:
            print(f"Error getting site icon: {e}")
            return None

    def get_rss_feeds(self):
        """Return every document in ``rss_feeds`` of ``self.user_arango`` as a list."""
        return list(self.user_arango.db.collection("rss_feeds").all())

    def download_icon(self, icon_url, save_folder="external_icons"):
        """Download *icon_url* into *save_folder* and return the local path.

        Returns:
            str | None: Path of the written file, or ``None`` on a non-200
            response or any error (which is printed, not raised).
        """
        try:
            # exist_ok avoids the check-then-create race of a manual test.
            os.makedirs(save_folder, exist_ok=True)
            with requests.get(
                icon_url, stream=True, timeout=self.REQUEST_TIMEOUT
            ) as response:
                if response.status_code != 200:
                    print(f"Failed to download icon: {response.status_code}")
                    return None
                # Name the file after the URL path only, so a query string
                # does not end up in the file name.
                icon_name = os.path.basename(urllib.parse.urlparse(icon_url).path)
                icon_path = os.path.join(save_folder, icon_name)
                with open(icon_path, "wb") as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                return icon_path
        except Exception as e:
            print(f"Error downloading icon: {e}")
            return None

    def html_to_markdown(self, html):
        """Convert a small HTML fragment to Markdown-flavoured plain text.

        ``<br>`` becomes a newline, ``<strong>``/``<em>`` become ``**``/``*``
        emphasis, ``<p>`` becomes its text followed by a blank line; all other
        tags are dropped. ``None`` or empty input yields an empty string.
        """
        if not html:
            return ""
        soup = BeautifulSoup(html, "html.parser")
        for br in soup.find_all("br"):
            br.replace_with("\n")
        for strong in soup.find_all("strong"):
            strong.replace_with(f"**{strong.text}**")
        for em in soup.find_all("em"):
            em.replace_with(f"*{em.text}*")
        # Paragraphs last, so their text already contains the inline markers.
        for p in soup.find_all("p"):
            p.replace_with(f"{p.text}\n\n")
        return soup.get_text()

    def get_full_content(self, url):
        """Download and parse the page at *url*.

        NOTE(review): the parsed page is not returned or stored, exactly as in
        the original; this method looks unfinished. Decide what it should
        return before relying on it.
        """
        result = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(result.content, "html.parser")
class RSSAnalyzer(BaseClass):
    """Analyze a user's RSS feeds with an LLM.

    Holds an ``LLM`` configured for feed analysis and an ``RSSReader`` for the
    same user.
    """

    def __init__(self, username):
        """Set up the LLM and the feed reader for *username*.

        The method was previously named ``init``, so Python never ran it on
        construction and ``llm`` / ``rss_reader`` were never set.
        """
        super().__init__(username=username)
        self.llm = LLM(system_message="You are reading RSS Feeds to analyze them.")
        # The user database handle is not reassigned here: BaseClass.__init__
        # already sets self.user_arango, and the previously called
        # get_arango_db is not a method BaseClass is shown to provide.
        # RSSReader.__init__ accepts only the username and builds its own
        # database handles.
        self.rss_reader = RSSReader(username)

    # Backward-compatible alias for code that called .init(username) explicitly.
    init = __init__

@ -0,0 +1,6 @@
from _arango import ArangoDB
# One-off admin script: make sure every listed user database has an
# 'rss_feeds' collection.
for db in ['lasse', 'nisse', 'torill', 'irma']:
    arango = ArangoDB(db_name=db)
    # Guard the creation so the script can be re-run safely; creating a
    # collection that already exists raises an error.
    if not arango.db.has_collection('rss_feeds'):
        arango.db.create_collection('rss_feeds')

@ -238,11 +238,15 @@ class Processor:
local_chroma_deployment: bool = False,
process: bool = True,
document_type: str = None,
username: str = None,
):
self.document = document
self.chromadb = ChromaDB(local_deployment=local_chroma_deployment, db=chroma_db)
self.len_chunks = len_chunks
self.document_type = document_type
self.filename = filename
self.username = username if username else document.username
self._id = None
@ -353,7 +357,7 @@ class Processor:
)
else:
chroma_collection = self.chromadb.db.get_or_create_collection(
"other_documents"
f"{self.username}__other_documents"
)
chroma_collection.add(ids=ids, documents=documents, metadatas=metadatas)
@ -454,11 +458,13 @@ class Processor:
model="small",
max_length_answer=500,
)
text = pymupdf4llm.to_markdown(
self.document.pdf, page_chunks=False, show_progress=False, pages=[0, 1]
)
if len(self.document.pdf) == 1:
pages = [0]
else:
pages = [0, 1]
text = pymupdf4llm.to_markdown(
self.document.pdf, page_chunks=False, show_progress=False, pages=pages
)
prompt = f'''
Below is the beginning of an article. I want to know when it's published, the title, and the journal.
@ -468,7 +474,8 @@ class Processor:
Answer ONLY with the information requested.
I want to know the published date on the form "YYYY-MM-DD".
I want the full title of the article and the journal.
I want the full title of the article.
I want the name of the journal/paper/outlet where the article was published.
Be sure to answer on the form "published_date;title;journal" as the answer will be used in a CSV.
If you can't find the information, answer "not_found".
'''
@ -553,9 +560,10 @@ class Processor:
if response.status_code == 200:
data = response.json()
if data.get("results", []) == []:
print(f"DOI {doi} not found in DOAJ.")
print_yellow(f"{doi} not found in DOAJ.")
return False
else:
print_green(f"{doi} found in DOAJ.")
return data
else:
print(
@ -648,11 +656,18 @@ class Processor:
only_meta=True
)
if "_key" not in self.document.doc:
_key = (
self.document.doi
or self.document.title
or self.document.get_title()
)
if self.document.doi:
_key = self.document.doi
elif self.document.title:
_key = self.document.title
elif self.document.get_title():
_key = self.document.get_title()
elif 'title' in self.document.doc["metadata"] and self.document.doc["metadata"]["title"]:
_key = self.document.doc["metadata"]["title"]
else:
_key = self.document.pdf_file.name
print_yellow(f"Document key: {_key}")
print(self.document.doi, self.document.title, self.document.get_title())
self.document.doc["_key"] = fix_key(_key)

@ -0,0 +1,192 @@
import yaml
import sys
import bcrypt
from _arango import ArangoDB
import os
import dotenv
import getpass
import argparse
import string
import secrets
from utils import fix_key
from colorprinter.print_color import *
dotenv.load_dotenv()
def read_yaml(file_path):
    """Load the YAML document stored at *file_path* and return its content."""
    with open(file_path, "r") as handle:
        content = yaml.safe_load(handle)
    return content
def write_yaml(file_path, data):
    """Serialize *data* as YAML into *file_path*, replacing any existing file."""
    with open(file_path, "w") as handle:
        yaml.safe_dump(data, handle)
def add_user(data, username, email, name, password):
    """Register a new user in the credentials structure *data* (in place).

    Prints an error and exits with status 1 when the username or the email
    is already taken. The password is stored as a bcrypt hash.
    """
    users = data["credentials"]["usernames"]

    # Reject a duplicate username
    if username in users:
        print(f"Error: Username '{username}' already exists.")
        sys.exit(1)

    # Reject a duplicate email
    if any(existing["email"] == email for existing in users.values()):
        print(f"Error: Email '{email}' already exists.")
        sys.exit(1)

    # Hash the password with a fresh salt; only the hash is stored
    encoded = password.encode("utf-8")
    digest = bcrypt.hashpw(encoded, bcrypt.gensalt())

    users[username] = {
        "email": email,
        "name": name,
        "password": digest.decode("utf-8"),
    }
def make_arango(username):
    """Provision the ArangoDB database and collections for *username*.

    Creates the database (granting access to the ``ARANGO_USER`` account) if
    it is missing, creates every required collection that does not exist yet,
    and inserts the initial settings document.
    """
    root_credentials = {
        "user": os.getenv("ARANGO_ROOT_USER"),
        "password": os.getenv("ARANGO_ROOT_PASSWORD"),
    }

    # Databases are created through the _system database.
    system_db = ArangoDB(db_name="_system", **root_credentials).db
    if not system_db.has_database(username):
        app_account = {
            "username": os.getenv("ARANGO_USER"),
            "password": os.getenv("ARANGO_PASSWORD"),
            "active": True,
            "extra": {},
        }
        system_db.create_database(username, users=[app_account])

    # Collections are created with root privileges inside the user's database.
    new_db = ArangoDB(db_name=username, **root_credentials).db
    required_collections = (
        "projects",
        "favorite_articles",
        "article_collections",
        "settings",
        "chats",
        "notes",
        "other_documents",
        "rss_feeds",
    )
    for collection_name in required_collections:
        if new_db.has_collection(collection_name):
            continue
        new_db.create_collection(collection_name)

    # The starting settings are written through a default-credential handle.
    settings = ArangoDB(db_name=username).db.collection("settings")
    settings.insert({"current_page": "Bot Chat", "current_project": None})
def generate_random_password(length=16):
    """Return a random password of three six-character groups joined by "-".

    Characters are drawn from ASCII letters and digits using ``secrets``.
    The *length* argument is accepted but not used: the result is always
    20 characters long.
    """
    alphabet = string.ascii_letters + string.digits
    groups = []
    for _ in range(3):
        picked = [secrets.choice(alphabet) for _ in range(6)]
        groups.append("".join(picked))
    return "-".join(groups)
def delete_user(data, username):
    """Remove *username* from the credentials data and from ArangoDB.

    Exits with status 1 if the user is unknown. Otherwise the user's entry is
    deleted from *data* (in place), the user's database is dropped if it
    exists, and the username is removed from ``user_access`` on documents in
    the shared ``sci_articles`` and ``other_documents`` collections.
    """
    users = data["credentials"]["usernames"]
    if username not in users:
        print(f"Error: Username '{username}' does not exist.")
        sys.exit(1)

    # Drop the user from the YAML-backed credentials
    del users[username]

    root_user = os.getenv("ARANGO_ROOT_USER")
    root_password = os.getenv("ARANGO_ROOT_PASSWORD")
    base_arango = ArangoDB(user=root_user, password=root_password, db_name="base")

    # Drop the user's own database, if there is one
    system_db = ArangoDB(
        user=root_user, password=root_password, db_name="_system"
    ).db
    if system_db.has_database(username):
        system_db.delete_database(username)

    # Revoke the user's access on shared documents
    access_query = """
FOR doc IN @@collection_name
FILTER @username IN doc.user_access
RETURN {'_id': doc._id, 'user_access': doc.user_access}
"""
    for collection_name in ("sci_articles", "other_documents"):
        matches = base_arango.db.aql.execute(
            access_query,
            bind_vars={"username": username, "@collection_name": collection_name},
        )
        for document in matches:
            if 'user_access' not in document:
                continue
            # Only the first occurrence of the username is removed
            document['user_access'].remove(username)
            base_arango.db.collection(collection_name).update(document)

    print_green(f"User {username} deleted successfully.")
def main():
    """Command-line entry point: add a user, or delete one with ``--delete``.

    Adding takes ``--user``, ``--email`` and ``--name`` (plus an optional
    ``--password`` of at least 8 characters); if any of the three is missing,
    all values are prompted for interactively. A random password is generated
    and printed when none usable is supplied. The credentials are read from
    and written back to ``streamlit_users.yaml``.
    """
    parser = argparse.ArgumentParser(description="Add or delete a user.")
    parser.add_argument("--user", help="Username")
    parser.add_argument("--email", help="Email address")
    parser.add_argument("--name", help="Full name")
    parser.add_argument("--password", help="Password")
    parser.add_argument("--delete", action="store_true", help="Delete user")
    args = parser.parse_args()

    yaml_file = "streamlit_users.yaml"
    data = read_yaml(yaml_file)

    if args.delete:
        if args.user:
            username = args.user
            delete_user(data, username)
            write_yaml(yaml_file, data)
        else:
            print("Error: Username is required to delete a user.")
            sys.exit(1)
    else:
        if args.user and args.email and args.name:
            username = args.user
            email = args.email
            name = args.name
            if args.password and len(args.password) >= 8:
                password = args.password
            else:
                password = generate_random_password()
                print_yellow("Generated password:", password)
        else:
            username = input("Enter username: ")
            email = input("Enter email: ")
            name = input("Enter name: ")
            password = getpass.getpass("Enter password: ")
            if not password or password == "":
                password = generate_random_password()
                print_yellow("Generated password:", password)

        # Convenience for development: an existing 'test' user is replaced.
        # Only delete when it actually exists, because delete_user() exits
        # with an error for unknown users and would abort the first creation.
        if username == 'test' and username in data["credentials"]["usernames"]:
            delete_user(data, username)

        email = email.lower().strip()
        checked_username = fix_key(username)
        if checked_username != username:
            # Report the rejected name BEFORE replacing it; otherwise the
            # message would show the sanitised name as the invalid one.
            print_red(f"Username '{username}' contains invalid characters.")
            print_yellow(f"Using '{checked_username}' instead.")
            username = checked_username

        add_user(data, username, email, name, password)
        make_arango(username)
        write_yaml(yaml_file, data)
        print_green(f"User {username} added successfully.")


if __name__ == "__main__":
    main()

@ -1,97 +0,0 @@
import yaml
import sys
import bcrypt
from _arango import ArangoDB
import os
import dotenv
import getpass
dotenv.load_dotenv()
def read_yaml(file_path):
    """Return the parsed contents of the YAML file at ``file_path``.

    The file is parsed with ``yaml.safe_load``.
    """
    # Decode explicitly as UTF-8 so that non-ASCII names are read the same way
    # on every platform instead of depending on the locale's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)
def write_yaml(file_path, data):
    """Serialise ``data`` with ``yaml.safe_dump`` into ``file_path``, replacing its contents."""
    with open(file_path, "w") as stream:
        yaml.safe_dump(data, stream)
def add_user(data, username, email, name, password):
    """Add a user with a bcrypt-hashed password to ``data`` in place.

    Args:
        data: Mapping holding the users under ``data["credentials"]["usernames"]``.
        username: Key of the new user.
        email: Email address of the new user.
        name: Full name of the new user.
        password: Plain-text password; only its bcrypt hash is stored.

    Exits the process with status 1 (after printing an error) when the
    username or the email address is already present.
    """
    users = data["credentials"]["usernames"]

    # Check for existing username
    if username in users:
        print(f"Error: Username '{username}' already exists.")
        sys.exit(1)

    # Check for existing email. ``.get`` is used so that an entry without an
    # email field does not abort the scan with a KeyError.
    if any(user.get("email") == email for user in users.values()):
        print(f"Error: Email '{email}' already exists.")
        sys.exit(1)

    # Hash the password using bcrypt
    hashed_password = bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt()).decode(
        "utf-8"
    )

    # Add the new user
    users[username] = {
        "email": email,
        "name": name,
        "password": hashed_password,
    }
def make_arango(username):
    """Create the ArangoDB database and collections for ``username``.

    Connects as the root user (``ARANGO_ROOT_USER`` / ``ARANGO_ROOT_PASSWORD``),
    creates a database named after the user if it does not exist yet (granting
    access to the ``ARANGO_USER`` account), creates any missing collections and
    finally inserts a default document into ``settings``.
    """
    root_credentials = {
        "user": os.getenv("ARANGO_ROOT_USER"),
        "password": os.getenv("ARANGO_ROOT_PASSWORD"),
    }

    system_db = ArangoDB(db_name="_system", **root_credentials).db
    if not system_db.has_database(username):
        app_account = {
            "username": os.getenv("ARANGO_USER"),
            "password": os.getenv("ARANGO_PASSWORD"),
            "active": True,
            "extra": {},
        }
        system_db.create_database(username, users=[app_account])

    user_db = ArangoDB(db_name=username, **root_credentials).db
    required_collections = (
        "projects",
        "favorite_articles",
        "article_collections",
        "settings",
        "chats",
        "notes",
        "other_documents",
    )
    for collection_name in required_collections:
        if not user_db.has_collection(collection_name):
            user_db.create_collection(collection_name)

    # The default settings are written through a connection that uses the
    # default (non-root) credentials of ArangoDB().
    default_settings = {"current_page": "Bot Chat", "current_project": None}
    ArangoDB(db_name=username).db.collection("settings").insert(default_settings)
def main():
    """Add one user, taking the details from ``sys.argv`` or from interactive prompts.

    With exactly four command-line arguments they are read as
    ``username email name password``; otherwise every value is prompted for.
    The user is added to ``streamlit_users.yaml`` and the matching ArangoDB
    database is created.
    """
    yaml_file = "streamlit_users.yaml"

    if len(sys.argv) != 5:
        username = input("Enter username: ")
        email = input("Enter email: ")
        name = input("Enter name: ")
        password = getpass.getpass("Enter password: ")
    else:
        username, email, name, password = sys.argv[1:5]

    users = read_yaml(yaml_file)
    add_user(users, username, email, name, password)
    make_arango(username)
    write_yaml(yaml_file, users)
    print(f"User {username} added successfully.")


if __name__ == "__main__":
    main()

@ -28,7 +28,7 @@ def get_assistant_prompt():
You should not write a reference section as this will be added later.
Format your answers in Markdown format. """
def get_editor_prompt(project: "Project", tools: bool = False):
def get_editor_prompt(project: "Project"):
"""Generates a coaching prompt for an editor to assist a reporter with a specific project.
Args:
@ -51,14 +51,32 @@ def get_editor_prompt(project: "Project", tools: bool = False):
return f'''You are an editor coaching a journalist who is working on the project "{project.name}". {description_string(project)}
{notes_string}
When writing with the reporter you will also get other information, like excerpts from articles and other documents. Use the notes to put the information in context and help the reporter to move forward.
When writing with the reporter you will _often_ get other information, like excerpts from articles and other documents. Use the notes to put the information in context and help the reporter to move forward.
If no other information is provided, try to answer based on the conversation history. If there is no history, and you're requested to answer in a conversational way, don't pretent to know things you don't have information about.
The project is a journalistic piece, so it is important that you help the reporter to be critical of the sources and to provide a balanced view of the topic.
Be sure to understand what the reporter is asking and provide the information in a way that is helpful for the reporter to move forward. Try to understand if the reporter is asking for a specific piece of information or if they are looking for guidance on how to move forward, or just want to discuss the topic.
If you need more information to answer the question, try to get it.
'''
def get_chat_prompt(user_input, content_string, role):
if role == "Research Assistant":
def get_chat_prompt(user_input, role, content_string=None, content_attachment=None, image_attachment=False):
if image_attachment:
return f'''{user_input}
Use the attached image to write your response.
'''
elif content_attachment:
return f'''{user_input}
Content of the attached file:
"""
{content_attachment}
"""
Respond to "{user_input}" based on the information in the attachment.
Fomat your answer in a way that is easy to understand for a general audience, and in an basic Markdown format.
'''
elif role == "Research Assistant":
prompt = f'''{user_input}
Below are snippets from different articles, often with title and date of publication.
@ -151,10 +169,9 @@ def get_image_system_prompt(project: "Project"):
return re.sub(r"\s*\n\s*", "\n", system_message)
def get_tools_prompt(user_input):
return f'''The reporter has asked: "{user_input}"
What information is needed to answer the question? Choose one or many tools in order to answer the question. Make sure to read the description of the tools carefully before choosing.
If you are shure that you can answer the question in a correct way without fetching data, you can do that as well.
return f'''User message: "{user_input}"
Choose one or many tools in order to answer the message. It's important that you think of what information (if any) is needed to make a good answer.
Make sure to read the description of the tools carefully before choosing!
'''

@ -8,12 +8,17 @@ from time import sleep
from colorprinter.print_color import *
from _arango import ArangoDB
def get_settings():
"""
Function to get the settings from the ArangoDB.
"""
arango = ArangoDB(db_name=st.session_state["username"])
st.session_state["settings"] = arango.db.collection("settings").get("settings")
settings = arango.db.collection("settings").get("settings")
if settings:
st.session_state["settings"] = settings
else:
st.session_state["settings"] = {'current_collection': None, 'current_page': None}
return st.session_state["settings"]
@ -49,7 +54,14 @@ if st.session_state["authentication_status"]:
for _ in range(3):
try:
from streamlit_pages import Article_Collections, Bot_Chat, Projects, Settings
from streamlit_pages import (
Article_Collections,
Bot_Chat,
Projects,
Settings,
RSS_Feeds
)
break
except ImportError as e:
# Write the full error traceback
@ -57,25 +69,54 @@ if st.session_state["authentication_status"]:
print_red(e)
print("Retrying to import pages...")
get_settings()
if 'current_page' in st.session_state["settings"]:
st.session_state["settings"] = get_settings()
if isinstance(st.session_state["settings"], dict) and "current_page" in st.session_state["settings"]:
st.session_state["current_page"] = st.session_state["settings"]["current_page"]
else:
if 'current_page' not in st.session_state:
if "current_page" not in st.session_state:
st.session_state["current_page"] = None
if "not_downloaded" not in st.session_state:
st.session_state["not_downloaded"] = {}
# Pages
bot_chat = st.Page(Bot_Chat)
projects = st.Page(Projects)
article_collections = st.Page(Article_Collections)
settings = st.Page(Settings)
pg = st.navigation([bot_chat, projects, article_collections, settings])
pg.run()
rss_feeds = st.Page(RSS_Feeds)
pg = st.navigation([bot_chat, projects, article_collections, rss_feeds, settings])
try:
pg.run()
except Exception as e:
print_red(e)
st.error("An error occurred. The site will be reloaded.")
import traceback
from datetime import datetime
from time import sleep
traceback_string = traceback.format_exc()
traceback.print_exc()
arango = ArangoDB(db_name="base")
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
print_rainbow(st.session_state.to_dict())
session_state = st.session_state.to_dict()
if 'bot' in session_state:
del session_state['bot']
arango.db.collection("error_logs").insert(
{
"error": traceback_string,
"_key": timestamp,
"session_state": session_state,
},
overwrite=True,
)
with st.status(":red[An error occurred. The site will be reloaded.]"):
for i in range(5):
sleep(1)
st.write(f"Reloading in {5-i} seconds...")
st.rerun()
with st.sidebar:
st.write("---")
authenticator.logout()
@ -84,4 +125,4 @@ if st.session_state["authentication_status"]:
elif st.session_state["authentication_status"] is False:
st.error("Username/password is incorrect")
elif st.session_state["authentication_status"] is None:
st.warning("Please enter your username and password")
st.warning("Please enter your username and password")

@ -13,10 +13,15 @@ class Chat(BaseClass):
self.name = kwargs.get("name", None)
self.chat_history = kwargs.get("chat_history", [])
self.role = role
self.message_attachments = None
def add_message(self, role, content):
self.chat_history.append(
{"role": role, "content": content.strip().strip('"'), "role_type": self.role}
{
"role": role,
"content": content.strip().strip('"'),
"role_type": self.role,
}
)
def to_dict(self):
@ -102,26 +107,27 @@ class Bot(BaseClass):
if not self.collection and self.project:
self.collection = self.project.collections
if not isinstance(self.collection, list):
self.collection = [self.collection]
# Load articles in the collections
self.arango_ids = []
self.arango_ids = []
for collection in self.collection:
for _id in self.user_arango.db.aql.execute(
'''
"""
FOR doc IN article_collections
FILTER doc.name == @collection
FOR article IN doc.articles
RETURN article._id
''',
""",
bind_vars={"collection": collection},
):
):
self.arango_ids.append(_id)
self.chosen_backend = kwargs.get("chosen_backend", None)
self.chatbot: LLM = None
self.chatbot: LLM = LLM()
self.tools: list[dict] = None
self.chatbot_memory = None
@ -138,13 +144,17 @@ class Bot(BaseClass):
self.toolbot = LLM(
temperature=0,
system_message="Choose one or many tools to use in order to assist the user. Make sure to read the description of the tools carefully.",
system_message="""
You are an assistant bot helping an answering bot to answer a user's messages.
Your task is to choose one or multiple tools that will help the answering bot to provide the user with the best possible answer.
Try to understand if the answering bot needs any information to answer the user's message, and if so, choose the tool that will provide that information.
You should NEVER try to answer the user's message yourself, only choose the tool that will help the answering bot to answer the user's message.
** Make sure to read the description of the tools carefully! **
You MUST choose a tool, if no additional information is needed, choose "conversational_response".""",
chat=False,
model="small",
)
# self.sidebar_content()
def sidebar_content(self):
@ -265,27 +275,52 @@ class Bot(BaseClass):
return grouped_chunks
def process_user_input(self, user_input):
def process_user_input(self, user_input, content_attachment=None):
# Add user's message to chat history
self.chat.add_message("user", user_input)
# Generate response with tool support
prompt = get_tools_prompt(user_input)
response = self.toolbot.generate(prompt, tools=self.tools, stream=False)
print_yellow("Tool to use")
# Check if the LLM wants to use a tool
if isinstance(response, dict) and "tool_calls" in response:
bot_response = self.answer_tool_call(response, user_input)
if not content_attachment:
prompt = get_tools_prompt(user_input)
response = self.toolbot.generate(prompt, tools=self.tools, stream=False)
# Check if the LLM wants to use a tool
if isinstance(response, dict) and "tool_calls" in response:
print_yellow("Tool(s) to use:", response["tool_calls"])
bot_response = self.answer_tool_call(
response, user_input=user_input
)
else:
# Use the LLM's direct response
bot_response = response.strip('"')
with st.chat_message(
"assistant", avatar=self.chat.get_avatar(role="assitant")
):
st.write(bot_response)
else:
# Use the LLM's direct response
bot_response = response.strip('"')
with st.chat_message(
"assistant", avatar=self.chat.get_avatar(role="assitant")
"assistant", avatar=self.chat.get_avatar(role="assistant")
):
st.write(bot_response)
with st.spinner("Reading the content..."):
if self.chat.message_attachments == 'image':
prompt = get_chat_prompt(
user_input, role=self.chat.role, image_attachment=True
)
print_yellow("Content attachment:", type(content_attachment))
bot_response = self.chatbot.generate(
prompt,
stream=False,
images=[content_attachment],
model="vision",
)
st.write(bot_response)
else:
prompt = get_chat_prompt(
user_input, content_attachment=content_attachment, role=self.chat.role
)
response = self.chatbot.generate(prompt, stream=True)
bot_response = st.write_stream(response)
# Add assistant's message to chat history
if self.chat.chat_history[-1]["role"] != "assistant":
self.chat.add_message("assistant", bot_response)
@ -323,6 +358,7 @@ class Bot(BaseClass):
]:
chunks = getattr(self, function_name)(**arguments)
# Provide the tool's output back to the LLM
response = self.generate_from_chunks(user_input, chunks)
bot_response = st.write_stream(response)
bot_response = bot_response.strip('"')
@ -357,7 +393,7 @@ class Bot(BaseClass):
for note in notes:
notes_string += f"\n# {note['title']}\n{note['content']}\n---\n"
prompt = get_chat_prompt(user_input, notes_string, role=self.chat.role)
prompt = get_chat_prompt(user_input, content_string=notes_string, role=self.chat.role)
with st.spinner("Reading project notes..."):
return self.chatbot.generate(prompt, stream=True)
@ -377,7 +413,7 @@ class Bot(BaseClass):
f"{chunks_content_string}\n---\n"
)
prompt = get_chat_prompt(user_input, chunks_string, role=self.chat.role)
prompt = get_chat_prompt(user_input, content_string=chunks_string, role=self.chat.role)
magazines = list(
set(
@ -394,7 +430,7 @@ class Bot(BaseClass):
s = "Reading articles..."
with st.spinner(s):
return (
self.chatbot.generate(prompt, stream=True)
self.chatbot.generate(prompt, user_input=user_input, context=chunks_string, stream=True)
if self.chatbot
else self.llm.generate(prompt, stream=True)
)
@ -405,8 +441,43 @@ class Bot(BaseClass):
# Display chat history
self.chat.show_chat_history()
self.attachment = 'image'
if user_input := st.chat_input("Write your message here...", accept_file=True):
user_input.text = user_input.text.replace('"""', '---')
if len(user_input.files) > 1:
st.error("Please upload only one file at a time.")
if user_input.files:
print(user_input.files)
attached_file = user_input.files[0]
if attached_file.type == "application/pdf":
# Read the PDF content
pdf_content = attached_file.read()
# Open the PDF with PyMuPDF
import fitz
pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
content_attachment = ""
for page_num in range(len(pdf_document)):
page = pdf_document.load_page(page_num)
content_attachment += page.get_text()
print_green("PDF text:", content_attachment)
elif (
attached_file.type == "image/png"
or attached_file.type == "image/jpeg"
):
self.chat.message_attachments = 'image'
content_attachment = attached_file.read()
with st.chat_message(
"user", avatar=self.chat.get_avatar(role="user")
):
st.image(content_attachment)
else:
content_attachment = None
user_input = user_input.text
if user_input := st.chat_input("Write your message here..."):
with st.chat_message("user", avatar=self.chat.get_avatar(role="user")):
st.write(user_input)
if not self.chat.name:
@ -428,7 +499,7 @@ class Bot(BaseClass):
}
)
self.chat_key = chat_doc["_key"]
self.process_user_input(user_input)
self.process_user_input(user_input, content_attachment)
self.update_session_state()
def get_notes(self):
@ -453,6 +524,11 @@ class Bot(BaseClass):
},
)
def fetch_science_articles(self, query: str, n_documents: int):
    # Coerce the requested count to an int and clamp it to the range 3..10
    # before querying the "sci_articles" collection.
    n_results = min(max(int(n_documents), 3), 10)
    return self.get_chunks(
        query,
        collections=["sci_articles"],
        n_results=n_results,
    )
@ -472,8 +548,13 @@ class Bot(BaseClass):
},
)
def fetch_other_documents(self, query: str, n_documents: int):
n_documents = int(n_documents)
if n_documents < 2:
n_documents = 2
elif n_documents > 10:
n_documents = 10
return self.get_chunks(
query, collections=["other_documents"], n_results=n_documents
query, collections=[f"{self.username}__other_documents"], n_results=n_documents
)
@ToolRegistry.register(
@ -491,25 +572,35 @@ class Bot(BaseClass):
},
)
def fetch_science_articles_and_other_documents(self, query: str, n_documents: int):
n_documents = int(n_documents)
if n_documents < 3:
n_documents = 3
elif n_documents > 10:
n_documents = 10
return self.get_chunks(
query,
collections=["sci_articles", "other_documents"],
collections=["sci_articles", f"{self.username}__other_documents"],
n_results=n_documents,
)
@ToolRegistry.register(
name="fetch_notes",
description="Fetches information from the project notes when you as an editor need context from the project notes to understand other information. ONLY use this together with other tools!",
description="Fetches information from the project notes when you as an editor need context from the project notes to understand other information. ONLY use this together with other tools! No arguments needed.",
)
def fetch_notes(self):
    # Tool entry point: hands back whatever get_notes() returns, unchanged.
    notes = self.get_notes()
    return notes
@ToolRegistry.register(
name="conversational_response",
description="Generates a conversational response without fetching data. Use this ONLY if it is obvious that the user is not looking for information but only wants to chat.",
description="Let the answering bot write a response without fetching data. Use this ONLY if it is obvious that the user is not looking for information but only wants to smalltalk (like saying 'hi'). No arguments or needed.",
)
def conversational_response(self, query: str):
query = f'User message: "{query}". Make your answer short and conversational. Include a very brief description of the project if you think that would be helpful.'
query = f"""
User message: "{query}".
Make your answer short and conversational.
This is perhaps not a conversation about a journalistic project, so don't try to be too informative.
Don't answer with anything you're not sure of!
"""
result = (
self.chatbot.generate(query, stream=True)
if self.chatbot
@ -550,7 +641,6 @@ class ResearchAssistantBot(Bot):
self.tools = ToolRegistry.get_tools(
tools=[
"fetch_science_articles",
"fetch_other_documents",
"fetch_science_articles_and_other_documents",
]
)
@ -572,7 +662,11 @@ class PodBot(Bot):
self.instructions = instructions
self.guest_name = kwargs.get("name_guest", "Merit")
self.hostbot = HostBot(
Chat(username=self.username, role="Host"), subject, username, instructions=instructions, **kwargs
Chat(username=self.username, role="Host"),
subject,
username,
instructions=instructions,
**kwargs,
)
self.guestbot = GuestBot(
Chat(username=self.username, role="Guest"),
@ -583,7 +677,7 @@ class PodBot(Bot):
)
def run(self):
notes = self.get_notes()
notes_string = ""
if self.instructions:
@ -607,12 +701,11 @@ class PodBot(Bot):
Say hello to the expert and start the interview. Remember to keep the interview to the subject of {self.subject} throughout the conversation.
'''
# Stop button for the podcast
with st.sidebar:
stop = st.button("Stop the podcast")
if stop:
st.session_state["make_podcast"] = False
stop = st.button("Stop podcast", on_click=self.stop_podcast)
while st.session_state["make_podcast"]:
# Stop the podcast if there are more than 14 messages in the chat
self.chat.show_chat_history()
if len(self.chat.chat_history) == 14:
@ -633,8 +726,7 @@ class PodBot(Bot):
stream=False,
)
if "tool_calls" in _q:
print_yellow("Tool call response (host)", _q)
print_purple("HOST", self.hostbot.chat.role)
print_yellow("Tool call response (host)", _q['tool_calls'])
q = self.hostbot.answer_tool_call(_q, a)
else:
q = _q
@ -653,26 +745,33 @@ class PodBot(Bot):
a = _a
self.chat.add_message("Guest", a)
self.update_session_state()
def stop_podcast(self):
st.session_state["make_podcast"] = False
self.update_session_state()
print_rainbow(st.session_state.to_dict())
self.chat.show_chat_history()
class HostBot(Bot):
def __init__(self, chat: Chat, subject: str, username: str, instructions: str, **kwargs):
def __init__(
self, chat: Chat, subject: str, username: str, instructions: str, **kwargs
):
super().__init__(chat=chat, username=username, **kwargs)
self.chat.role = kwargs.get("role", "Host")
self.tools = ToolRegistry.get_tools(
tools=[
"fetch_notes",
"conversational_response",
"fetch_other_documents",
#"fetch_other_documents", #TODO Should this be included?
]
)
self.instructions = instructions
self.llm = LLM(
system_message=f'''
You are the host of a podcast and an expert on {subject}. You will ask one question at a time about the subject, and then wait for the answer.
You are the host of a podcast and an expert on {subject}. You will ask one question at a time about the subject, and then wait for the guest to answer.
Don't ask the guest to talk about herself/himself, only about the subject.
Make your questions short and clear, only if necessary add a brief context to the question.
These are the instructions for the podcast from the producer:
"""
{self.instructions}
@ -682,11 +781,11 @@ class HostBot(Bot):
)
self.toolbot = LLM(
temperature=0,
system_message='''
system_message="""
You are assisting a podcast host in asking questions to an expert.
Choose one or many tools to use in order to assist the host in asking relevant questions.
Often "conversational_response" is enough, but sometimes notes are needed or even other documents.
Make sure to read the description of the tools carefully!''',
Often "conversational_response" is enough, but sometimes project notes are needed.
Make sure to read the description of the tools carefully!""",
chat=False,
model="small",
)
@ -709,6 +808,7 @@ class GuestBot(Bot):
system_message=f"""
You are {kwargs.get('name', 'Merit')}, an expert on {subject}.
Today you are a guest in a podcast about {subject}. A host will ask you questions about the subject and you will answer by using scientific facts and information.
When answering, don't say things like "based on the documents" or alike, as neither the host nor the audience can see the documents. Act just as if you were talking to someone in a conversation.
Try to be concise when answering, and remember that the audience of the podcast is not expert on the subject, so don't complicate things too much.
It's very important that you answer in a "spoken" way, as if you were talking to someone in a conversation. That means you should avoid using scientific jargon and complex terms, too many figures or abstract concepts.
Lists are also not recommended, instead use "for the first reason", "secondly", etc.

@ -18,7 +18,7 @@ def Bot_Chat():
Function to handle the Chat Bot page.
"""
from _classes import BotChatPage
if 'bot_chat_page' not in st.session_state:
if 'Bot Chat' not in st.session_state:
st.session_state['Bot Chat'] = {}
chatpage = BotChatPage(username=st.session_state["username"])
chatpage.run()
@ -28,7 +28,7 @@ def Article_Collections():
Function to handle the Article Collections page.
"""
from _classes import ArticleCollectionsPage
if 'article_collections' not in st.session_state:
if 'Article Collections' not in st.session_state:
st.session_state['Article Collections'] = {}
article_collection = ArticleCollectionsPage(username=st.session_state["username"])
@ -42,3 +42,15 @@ def Settings():
from _classes import SettingsPage
settings = SettingsPage(username=st.session_state["username"])
settings.run()
def RSS_Feeds():
    """
    Function to handle the RSS Feeds page.

    Makes sure the page's session-state slot exists, then builds the page
    object for the logged-in user and runs it.
    """
    from _classes import RSSFeedsPage

    page_key = 'RSS Feeds'
    if page_key not in st.session_state:
        st.session_state[page_key] = {}

    RSSFeedsPage(username=st.session_state["username"]).run()

@ -0,0 +1,345 @@
import os
import urllib
import streamlit as st
from _base_class import BaseClass
import feedparser
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from utils import fix_key
from colorprinter.print_color import *
from datetime import datetime, timedelta
class RSSFeedsPage(BaseClass):
    """Streamlit page for discovering, subscribing to and reading RSS feeds.

    Subscriptions are written to the ``rss_feeds`` collection of the user's
    database. A copy holding the parsed feed data and its fetch timestamp is
    kept in the ``rss_feeds`` collection of the base database; that copy is
    what ``get_feed`` reads and ``update_feed`` refreshes.
    """

    # Seconds to wait for a web server before giving up on an HTTP request.
    REQUEST_TIMEOUT = 10

    def __init__(self, username: str):
        super().__init__(username=username)
        self.page_name = "RSS Feeds"
        # Initialize attributes from session state if available
        for k, v in st.session_state.get(self.page_name, {}).items():
            setattr(self, k, v)

    def run(self):
        """Render the page: the selected feed in the main area, controls in the sidebar."""
        if "selected_feed" not in st.session_state:
            st.session_state["selected_feed"] = None
        self.update_current_page(self.page_name)
        self.display_feed()
        self.sidebar_actions()
        # Persist state to session_state
        self.update_session_state(page_name=self.page_name)

    def select_rss_feeds(self):
        """Show a selectbox with the user's feeds and remember the chosen one.

        The key of the chosen feed is stored in
        ``st.session_state["selected_feed"]``.
        """
        # Fetch RSS feeds from the user's ArangoDB collection
        rss_feeds = self.get_rss_feeds()
        if not rss_feeds:
            st.write("You have no RSS feeds added.")
            return

        feed_options = [feed["title"] for feed in rss_feeds]
        with st.sidebar:
            st.subheader("Show your feeds")
            selected_feed_title = st.selectbox(
                "Select a feed", options=feed_options, index=None
            )
            if selected_feed_title:
                selected_key = next(
                    feed["_key"]
                    for feed in rss_feeds
                    if feed["title"] == selected_feed_title
                )
                # Rerun only when the selection actually changed. The selectbox
                # keeps its value across reruns, so an unconditional rerun here
                # would restart the script forever.
                if st.session_state.get("selected_feed") != selected_key:
                    st.session_state["selected_feed"] = selected_key
                    st.rerun()

    def get_rss_feeds(self):
        """Return all documents of the user's ``rss_feeds`` collection as a list."""
        return list(self.user_arango.db.collection("rss_feeds").all())

    def sidebar_actions(self):
        """Render the sidebar: feed selection, feed discovery, preview and adding."""
        with st.sidebar:
            # Select a feed to show
            self.select_rss_feeds()

            st.subheader("Add a New RSS Feed")
            rss_url = st.text_input("Website URL or RSS Feed URL")
            if st.button("Discover Feeds"):
                if rss_url:
                    with st.spinner("Discovering feeds..."):
                        feeds = self.discover_feeds(rss_url)
                        if feeds:
                            st.session_state["discovered_feeds"] = feeds
                            st.rerun()
                        else:
                            st.error("No RSS feeds found at the provided URL.")

            if "discovered_feeds" in st.session_state:
                st.subheader("Select a Feed to Add")
                feeds = st.session_state["discovered_feeds"]
                feed_options = [f"{feed['title']} ({feed['href']})" for feed in feeds]
                selected_feed = st.selectbox("Available Feeds", options=feed_options)
                selected_feed_url = feeds[feed_options.index(selected_feed)]["href"]

                if st.button("Preview Feed"):
                    feed_data = feedparser.parse(selected_feed_url)
                    st.write(f"{feed_data.feed.get('title', 'No title')}")
                    description = html_to_markdown(
                        feed_data.feed.get("description", "No description")
                    )
                    st.write(f"_{description}_")
                    for entry in feed_data.entries[:5]:
                        print("ENTRY:")
                        with st.expander(entry.get("title", "No title")):
                            summary = (
                                entry.summary
                                if "summary" in entry
                                else "No summary available"
                            )
                            markdown_summary = html_to_markdown(summary)
                            st.markdown(markdown_summary)

                    # This button is nested inside the "Preview Feed" branch, so
                    # on the rerun triggered by clicking it the branch is not
                    # executed again. All follow-up work therefore has to happen
                    # in the on_click callback, not in an ``if st.button(...)``
                    # body, which would never run.
                    st.button(
                        "Add RSS Feed",
                        on_click=self._add_discovered_feed,
                        args=(selected_feed_url, feed_data, description),
                    )

    def _add_discovered_feed(self, url, feed_data, description):
        """Button callback: store the feed, then clear the list of discovered feeds."""
        self.add_rss_feed(url, feed_data, description)
        if "discovered_feeds" in st.session_state:
            del st.session_state["discovered_feeds"]

    def discover_feeds(self, url):
        """Find RSS feeds at ``url``.

        ``url`` may be a feed itself or a web page linking to feeds. A missing
        scheme is replaced by ``https://``.

        Returns:
            list[dict]: One dict per feed with the keys ``href``, ``title`` and
            ``icon``. Empty when nothing was found or an error occurred (the
            error is printed).
        """
        try:
            if not url.startswith("http"):
                url = "https://" + url

            # Check if the input URL is already an RSS feed
            f = feedparser.parse(url)
            if len(f.entries) > 0:
                return [
                    {
                        "href": url,
                        "title": f.feed.get("title", "No title"),
                        "icon": self.get_site_icon(url),
                    }
                ]

            # If not, proceed to discover feeds from the webpage
            raw = requests.get(url, timeout=self.REQUEST_TIMEOUT).text
            result = []
            possible_feeds = []
            html = BeautifulSoup(raw, "html.parser")

            # Find the site icon
            icon_url = self.get_site_icon(url, html)

            # Find all <link> tags with rel="alternate" and type containing "rss" or "xml"
            for link in html.find_all("link", rel="alternate"):
                t = link.get("type", None)
                if t and ("rss" in t or "xml" in t):
                    href = link.get("href", None)
                    if href:
                        possible_feeds.append(urljoin(url, href))

            # Find all <a> tags with href containing "rss", "xml", or "feed".
            # ``netloc`` (unlike ``hostname``) is always a string and keeps a
            # non-default port.
            parsed_url = urllib.parse.urlparse(url)
            base = f"{parsed_url.scheme}://{parsed_url.netloc}"
            for a in html.find_all("a"):
                href = a.get("href", None)
                if href and ("rss" in href or "xml" in href or "feed" in href):
                    possible_feeds.append(urljoin(base, href))

            # Validate the possible feeds using feedparser; dict.fromkeys
            # removes duplicates while keeping the order of discovery.
            for feed_url in dict.fromkeys(possible_feeds):
                f = feedparser.parse(feed_url)
                if len(f.entries) > 0:
                    result.append(
                        {
                            "href": feed_url,
                            "title": f.feed.get("title", "No title"),
                            "icon": icon_url,
                        }
                    )
            return result
        except Exception as e:
            # Best effort: discovery problems are reported as "nothing found".
            print(f"Error discovering feeds: {e}")
            return []

    def add_rss_feed(self, url, feed_data, description):
        """Subscribe the user to a feed and make sure the base database knows it.

        Args:
            url: URL of the feed; ``fix_key(url)`` is used as document key.
            feed_data: Parsed feed as returned by ``feedparser.parse``.
            description: Description text stored with the feed.
        """
        try:
            icon_url = feed_data["feed"]["image"]["href"]
        except (KeyError, TypeError, AttributeError):
            # The feed declares no image; fall back to the site's icon.
            icon_url = self.get_site_icon(url)

        title = feed_data["feed"].get("title", "No title")
        print_blue(title)
        icon_path = download_icon(icon_url) if icon_url else None
        _key = fix_key(url)
        now_timestamp = datetime.now().isoformat()  # Convert datetime to ISO format string

        feed_document = {
            "_key": _key,
            "url": url,
            "title": title,
            "icon_path": icon_path,
            "description": description,
            "fetched_timestamp": now_timestamp,
            "feed_data": feed_data,
        }

        self.user_arango.db.collection("rss_feeds").insert(
            dict(feed_document),
            overwrite=True,
        )

        feed = self.get_feed_from_arango(_key)
        if feed:
            self.update_feed(_key, feed)
        else:
            self.base_arango.db.collection("rss_feeds").insert(
                dict(feed_document),
                overwrite=True,
                overwrite_mode="update",
            )

    def update_feed(self, feed_key, feed=None):
        """
        Updates RSS feed that already exists in the ArangoDB base database.

        The feed URL is parsed again and the new feed data, the fetch timestamp
        and the list of users (extended by the current user if missing) are
        written to the base database.

        Args:
            feed_key (str): The key identifying the feed in the database.
            feed (dict, optional): The feed document, if the caller already
                fetched it; otherwise it is loaded by ``feed_key``.

        Returns:
            dict: The parsed feed data.
        """
        if not feed:
            feed = self.get_feed_from_arango(feed_key)

        feed_data = feedparser.parse(feed["url"])
        print_rainbow(feed_data['feed'])
        feed["feed_data"] = feed_data

        users = feed.get("users") or []
        if self.username not in users:
            users = users + [self.username]
        feed["users"] = users

        fetched_timestamp = datetime.now().isoformat()  # Convert datetime to ISO format string

        # Update the database; "users" is included so that the subscription
        # list computed above is actually persisted.
        self.base_arango.db.collection("rss_feeds").update(
            {
                "_key": feed["_key"],
                "fetched_timestamp": fetched_timestamp,
                "feed_data": feed_data,
                "users": users,
            }
        )
        return feed_data

    def update_session_state(self, page_name=None):
        """Store this object's attribute dict in ``st.session_state[page_name]``."""
        # Update session state
        if page_name:
            st.session_state[page_name] = self.__dict__

    def get_site_icon(self, url, html=None):
        """Return the absolute URL of the site's icon, or ``None``.

        Args:
            url: Page URL; used to download the page when ``html`` is not
                given and to resolve a relative icon link.
            html: Already parsed page (BeautifulSoup), if available.
        """
        try:
            if not html:
                raw = requests.get(url, timeout=self.REQUEST_TIMEOUT).text
                html = BeautifulSoup(raw, "html.parser")

            # Try rel="icon" first, then fall back to rel="shortcut icon".
            for rel in ("icon", "shortcut icon"):
                icon_link = html.find("link", rel=rel)
                if icon_link:
                    icon_url = icon_link.get("href", None)
                    if icon_url:
                        return urljoin(url, icon_url)
            return None
        except Exception as e:
            # Best effort: a missing icon must not break adding a feed.
            print(f"Error getting site icon: {e}")
            return None

    def get_feed_from_arango(self, feed_key):
        """
        Retrieve an RSS feed from the ArangoDB base database.

        Args:
            feed_key (str): The key of the RSS feed to retrieve from the ArangoDB base database.

        Returns:
            dict: The RSS feed document retrieved from the ArangoDB base database.
        """
        return self.base_arango.db.collection("rss_feeds").get(feed_key)

    def get_feed(self, feed_key):
        """Return the feed data for ``feed_key``, refreshing it when older than one hour.

        Returns ``None`` when no feed document is found for ``feed_key``.
        """
        feed = self.get_feed_from_arango(feed_key)
        if not feed:
            return None

        fetched_time = datetime.fromisoformat(feed['fetched_timestamp'])  # Parse the timestamp string
        if datetime.now() - fetched_time < timedelta(hours=1):
            return feed["feed_data"]
        return self.update_feed(feed_key, feed)

    def display_feed(self):
        """Show title, description and the five most recent entries of the selected feed."""
        if not st.session_state["selected_feed"]:
            return

        feed_data = self.get_feed(st.session_state["selected_feed"])
        if not feed_data:
            st.warning("The selected feed could not be found.")
            return

        st.title(feed_data['feed'].get("title", "No title"))
        st.write(feed_data['feed'].get("description", "No description"))
        st.write("**Recent Entries:**")
        for entry in feed_data['entries'][:5]:
            with st.expander(entry.get('title', "No title")):
                summary = (
                    entry['summary'] if "summary" in entry else "No summary available"
                )
                markdown_summary = html_to_markdown(summary)
                st.markdown(markdown_summary)
                if entry.get('link'):
                    st.markdown(f"[Read more]({entry['link']})")
def html_to_markdown(html):
    """Convert a small HTML fragment to Markdown-flavoured plain text.

    ``<br>`` becomes a newline, ``<strong>`` is wrapped in ``**``, ``<em>`` in
    ``*`` and each ``<p>`` is followed by a blank line; all remaining markup is
    dropped and only the text is returned.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Applied strictly in this order, one tag type at a time.
    replacements = (
        ("br", lambda node: "\n"),
        ("strong", lambda node: f"**{node.text}**"),
        ("em", lambda node: f"*{node.text}*"),
        ("p", lambda node: f"{node.text}\n\n"),
    )
    for tag_name, render in replacements:
        for node in soup.find_all(tag_name):
            node.replace_with(render(node))

    return soup.get_text()
def download_icon(icon_url, save_folder="external_icons", timeout=10):
    """
    Download an icon and store it in a local folder.

    Args:
        icon_url (str): URL of the icon to download.
        save_folder (str): Folder the icon is written to; created if missing.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str | None: Path of the saved file, or ``None`` when the download
        fails (non-200 response or any error, which is printed).
    """
    from urllib.parse import urlparse

    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(save_folder, exist_ok=True)

        # Name the file after the URL *path* only, so query strings and
        # fragments (e.g. "favicon.ico?v=2") do not end up in the filename,
        # and fall back to a default when the path ends in "/".
        icon_name = os.path.basename(urlparse(icon_url).path) or "favicon.ico"
        icon_path = os.path.join(save_folder, icon_name)

        # The context manager releases the streamed connection on every path;
        # the timeout keeps a dead server from hanging the caller forever.
        with requests.get(icon_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download icon: {response.status_code}")
                return None
            with open(icon_path, "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        return icon_path
    except Exception as e:
        # Best-effort helper: callers expect None rather than an exception.
        print(f"Error downloading icon: {e}")
        return None

@ -0,0 +1,91 @@
import asyncio
import re
from pdf_highlighter import Highlighter
from _chromadb import ChromaDB
from _llm import LLM
import ollama
from colorprinter.print_color import *
from concurrent.futures import ThreadPoolExecutor
# Wrap the synchronous generate method
async def async_generate(llm, prompt):
    """
    Run the blocking ``llm.generate(prompt)`` without blocking the event loop.

    Args:
        llm: Object exposing a synchronous ``generate(prompt)`` method.
        prompt: Prompt passed through to ``llm.generate``.

    Returns:
        Whatever ``llm.generate(prompt)`` returns.
    """
    # get_running_loop() is the supported way to obtain the loop from inside a
    # coroutine. Passing None uses the loop's shared default thread pool
    # instead of building and tearing down a new pool on every call.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, llm.generate, prompt)
# Define the main asynchronous function to highlight the PDFs
async def highlight_pdf(data):
    """
    Highlight the given snippets in their PDFs and save the combined result.

    Uses the module-level ``highlighter``; page numbers in ``data`` are
    passed as zero-based (0, 1, 2, ...). The output is written to
    ``highlighted_combined_documents.pdf`` in the working directory.

    Args:
        data: Highlight requests forwarded unchanged to ``highlighter.highlight``.
    """
    pdf_buffer = await highlighter.highlight(data=data, zero_indexed_pages=True)

    with open("highlighted_combined_documents.pdf", "wb") as out_file:
        out_file.write(pdf_buffer.getbuffer())

    print_green("PDF highlighting completed successfully!")
# Connect to ChromaDB and fetch the text snippets (with metadata) that best
# match the question.
chromadb = ChromaDB()
query = "How are climate researchers advocating for change in the society?"
result = chromadb.query(query, collection="sci_articles", n_results=5)

# The response holds parallel lists (ids, metadatas, documents, distances);
# take the first list of each and zip them into one dictionary per hit.
results = [
    dict(zip(("id", "metadata", "document", "distance"), hit))
    for hit in zip(
        result["ids"][0],
        result["metadatas"][0],
        result["documents"][0],
        result["distances"][0],
    )
]
for r in results:
    print_rainbow(r["metadata"])
    print_yellow(type(r["metadata"]['pages']))

# Let an LLM answer the question from the retrieved snippets only.
llm = LLM(model="small")
documents_string = "\n\n---\n\n".join(result["documents"][0])
answer = llm.generate(
    f'''{query} Write your answer from the information below?\n\n"""{documents_string}"""\n\n{query}'''
)
print_green(answer)
# Now highlight the relevant information in the PDFs to see what the LLM is using.
# Each hit from ChromaDB carries the PDF identifier and the pages where the text
# was found; build one highlight request per hit.
data = []
for hit in results:  # not named `result`, so the ChromaDB response above is not clobbered
    raw_pages = hit["metadata"].get("pages")
    try:
        # A single page stored as an int or a numeric string.
        pages = [int(raw_pages)]
    except (TypeError, ValueError):
        # Anything else (e.g. "3, 4", a list, or a missing value): extract every
        # run of digits. str() keeps this safe for None and non-string values;
        # a hit without page information yields an empty list.
        # NOTE(review): confirm the Highlighter accepts an empty page list.
        pages = [int(number) for number in re.findall(r"\d+", str(raw_pages))]
    data.append(
        {
            "user_input": query,
            "pdf_filename": hit["metadata"]["_id"],
            "pages": pages,
            'chunk': hit['document']
        }
    )
# Initialize the Highlighter.
# It must exist at module level before highlight_pdf() runs, because that
# coroutine reads the global `highlighter`.
highlighter = Highlighter(
llm=llm, # Pass the LLM to the Highlighter
comment=False, # Comments are switched off here. NOTE(review): an earlier note said "enable comments" - confirm the intended value
use_llm=False
)
# Run the main coroutine to completion on a fresh event loop
asyncio.run(highlight_pdf(data))

@ -0,0 +1,32 @@
import os
import base64
from ollama import Client
import env_manager
from colorprinter.print_color import *
# Load the environment variables used below.
env_manager.set_env()

# Encode the credentials for HTTP Basic authentication
credentials = f"{os.getenv('LLM_API_USER')}:{os.getenv('LLM_API_PWD_LASSE')}"
encoded_credentials = base64.b64encode(credentials.encode()).decode()

# Set up the headers with authentication details
headers = {
    'Authorization': f'Basic {encoded_credentials}'
}

# Get the host URL (base URL only).
api_url = os.getenv("LLM_API_URL")
if not api_url:
    raise RuntimeError("LLM_API_URL is not set")
# str.rstrip('/api/chat/') would strip any trailing run of the characters
# "/", "a", "p", "i", "c", "h", "t" and could eat the end of the host or path,
# so remove the exact "/api/chat" suffix instead.
host_url = api_url.rstrip('/')
if host_url.endswith('/api/chat'):
    host_url = host_url[:-len('/api/chat')]

# Initialize the client with the host and headers
client = Client(
    host=host_url,
    headers=headers
)

# Example usage of the client
try:
    response = client.chat(
        model=os.getenv('LLM_MODEL'),
        messages=[{'role': 'user', 'content': 'Why is the sky blue?'}],
    )
    print_rainbow(response)
except Exception as e:
    # Smoke-test script: report the failure instead of a traceback.
    print(f"Error: {e}")
Loading…
Cancel
Save