From d905a8a546c702cd7a06f085d62a990347809c60 Mon Sep 17 00:00:00 2001 From: lasseedfast Date: Wed, 16 Oct 2024 10:22:06 +0200 Subject: [PATCH] Refactor chatbot.py to improve code structure, integrate Streamlit app, and enhance user experience --- chatbot.py | 133 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 105 insertions(+), 28 deletions(-) diff --git a/chatbot.py b/chatbot.py index ef3ebed..c08f123 100644 --- a/chatbot.py +++ b/chatbot.py @@ -1,26 +1,24 @@ import streamlit as st -import crossref_commons.retrieval from _llm import LLM from _chromadb import ChromaDB from _arango import ArangoDB -from pprint import pprint from colorprinter.print_color import * +import re -# Initialize databases and chatbot -chromadb = ChromaDB() -arango = ArangoDB() -chatbot = LLM(temperature=0.1) - -# Streamlit app setup -st.title("EV Cars Chatbot") -st.write("Ask a question about EV car battery production:") - -# User input -user_input = st.text_input("Ask something") -if user_input: +def get_stream(response): + for i in response: + yield str(i) +def get_chunks(user_input, n_results=5): + query = helperbot.generate(f"""A user asked this question: "{user_input}". + Generate a query for the vector database. 
Make sure to follow the instructions you got earlier!""" + ) + # Strip the query from anything that is not a word character, number, or space + query = re.sub(r"[^\w\d\s]", "", query) + print_purple(query) + chunks = chromadb.db.get_collection("sci_articles").query( - query_texts=user_input, n_results=7 + query_texts=query, n_results=n_results ) combined_chunks = [ {"document": doc, "metadata": meta} @@ -29,7 +27,15 @@ if user_input: for i in combined_chunks: _key = i["metadata"]["_key"] arango_metadata = arango.db.collection("sci_articles").get(_key)["metadata"] - i["crossref_info"] = arango_metadata if arango_metadata else {'title': 'No title', 'published_date': 'No published date', 'journal': 'No journal'} + i["crossref_info"] = ( + arango_metadata + if arango_metadata + else { + "title": "No title", + "published_date": "No published date", + "journal": "No journal", + } + ) # Sort the combined_chunks list first by published_date, then by title sorted_chunks = sorted( @@ -42,26 +48,83 @@ if user_input: # Group the chunks by title grouped_chunks = {} + article_number = 1 # Initialize article counter for chunk in sorted_chunks: title = chunk["crossref_info"]["title"] + chunk["article_number"] = article_number # Add article number to chunk if title not in grouped_chunks: - grouped_chunks[title] = [] - grouped_chunks[title].append(chunk) + grouped_chunks[title] = {'article_number': article_number, 'chunks': []} + article_number += 1 # Increment article counter when a new title is encountered + grouped_chunks[title]['chunks'].append(chunk) + + return grouped_chunks - chunks_string = "" - for title, chunks in grouped_chunks.items(): - chunks_content_string = "\n(...)\n".join( - [chunk["document"] for chunk in chunks] - ) - chunks_string += f"""\n + +# Initialize session state for chat history +if "chat_history" not in st.session_state: + st.session_state.chat_history = [] + st.session_state.chatbot_memory = None + st.session_state.helperbot_memory = None + +# 
Initialize databases and chatbot +chromadb = ChromaDB() +arango = ArangoDB() + +chatbot = LLM( + temperature=0.1, + system_message="""You are chatting about electric cars. Only use the information from scientific articles you are provided with to answer questions. + Format your answers in Markdown format. Be sure to reference the source of the information with ONLY the number of the article in the running text (e.g. " [
]"). """, +) +if st.session_state.chat_history: + chatbot.messages = st.session_state.chatbot_memory + +helperbot = LLM( + temperature=0, + model="small", + max_length_answer=500, + system_message="""Take the user input and write it as a sentence that could be used as a query for a vector database. + The vector database will return text snippets that semantically match the query, so you CAN'T USE NEGATIONS or other complex language constructs. If there is a negation in the user input, exclude that part from the query. + If the user input seems to be a follow-up question or comment, use the context from the chat history to make a relevant query. + Answer ONLY with the query, no explanation or reasoning! + """, +) +if st.session_state.chat_history: + helperbot.messages = st.session_state.helperbot_memory + +# Streamlit app setup +st.title("🚗 Electric Cars Chatbot") + + +# User input +user_input = st.chat_input("") + +if user_input: + st.session_state.chat_history.append({"role": "user", "content": user_input}) + + for message in st.session_state.chat_history: + with st.chat_message(message["role"]): + if message['content']: + st.markdown(message["content"]) + + # Show a loading message + with st.spinner("Getting information from database..."): + relevant_chunks = get_chunks(user_input, n_results=5) #! Change n_results to 7 + + chunks_string = "" + for title, chunks in relevant_chunks.items(): + chunks_content_string = "\n(...)\n".join( + [chunk["document"] for chunk in chunks['chunks']] + ) + chunks_string += f"""\n # {title} -## {chunks[0]['crossref_info']['published_date']} in {chunks[0]['crossref_info']['journal']} +## Article number: {chunks['article_number']} +## {chunks['chunks'][0]['crossref_info']['published_date']} in {chunks['chunks'][0]['crossref_info']['journal']} {chunks_content_string}\n --- \n """ - prompt = f'''{user_input} + prompt = f'''{user_input} Below are snippets from different articles with title and date of publication. 
ONLY use the information below to answer the question. Do not use any other information. @@ -71,5 +134,19 @@ ONLY use the information below to answer the question. Do not use any other info {user_input} ''' - response = chatbot.generate(prompt) - st.write(response) \ No newline at end of file + + response = chatbot.generate(prompt, stream=True) # Assuming chatbot.generate returns a generator + with st.chat_message("assistant"): + bot_response = st.write_stream(get_stream(response)) + + sources = '###### Sources: \n' + for title, chunks in relevant_chunks.items(): + sources += f'''[{chunks['article_number']}] **{title}** :gray[{chunks['chunks'][0]['crossref_info']['journal']} ({chunks['chunks'][0]['crossref_info']['published_date']})] \n''' + st.markdown(sources) + bot_response = f'{bot_response}\n\n{sources}' + # Append user input and response to chat history + st.session_state.chat_history.append( + {"role": "assistant", "content": bot_response} + ) +st.session_state.chatbot_memory = chatbot.messages +st.session_state.helperbot_memory = helperbot.messages