# sci/test_research.py

from _llm import LLM
from _arango import ArangoDB
from _chromadb import ChromaDB
from streamlit_chatbot import Bot
from pydantic import BaseModel, Field
from typing import Dict, List, Tuple
from colorprinter.print_color import *
from projects_page import Project
from _base_class import StreamlitBaseClass
from prompts import get_tools_prompt

class ResearchBase(Bot):
    """Shared base for the research bots defined in this module.

    Extends ``Bot`` and equips every instance with its own ``LLM`` client,
    ``ArangoDB`` handle, ``ChromaDB`` handle and a message history list.
    """

    def __init__(self, username, **kwargs):
        """Initialise the parent ``Bot`` and create the per-bot clients.

        Args:
            username: Passed to ``Bot.__init__`` as the ``username`` keyword.
            **kwargs: Any further keyword arguments, forwarded unchanged to
                ``Bot.__init__``.
        """
        super().__init__(username=username, **kwargs)
        self.llm = LLM()
        self.arango = ArangoDB()
        self.chromadb = ChromaDB()
        self.messages = []

    def start(self):
        """Reset the message history and, for known aliases, call ``get_model``.

        The history is replaced by a single system entry holding the LLM's
        current ``system_message``. If the LLM's ``model`` attribute is one of
        the short alias names below, ``self.llm.get_model`` is called with it
        (presumably to resolve the alias to a concrete model; confirm in
        ``_llm``).
        """
        system_entry = {"role": "system", "message": self.llm.system_message}
        self.messages = [system_entry]

        model_aliases = ("small", "standard", "vision", "reasoning", "tools")
        current_model = self.llm.model
        if current_model in model_aliases:
            self.llm.get_model(current_model)


class ResearchManager(ResearchBase):
    """Bot that drafts a research plan for answering a journalist's question."""

    def __init__(self, username, project=None):
        """Configure the manager's LLM persona and remember the project.

        Args:
            username: Forwarded to ``ResearchBase.__init__``.
            project: Optional project object, forwarded to the parent
                initialiser. If it exposes a ``notes_summary`` attribute, that
                summary is included in the prompt built by ``generate_plan``.
        """
        super().__init__(username=username, project=project)
        # Keep our own reference so generate_plan() does not depend on a
        # module-level ``project`` variable happening to exist (it only does
        # when this file is executed as a script).
        self._plan_project = project
        self.llm.system_message = "You are an assistant helping a journalist writing a report based on extensive research."
        self.llm.model = "reasoning"
        self.start()

    def generate_plan(self, question):
        """Ask the LLM for a step-by-step plan on how to answer ``question``.

        Args:
            question: The question the final report should answer.

        Returns:
            The ``content`` attribute of the object returned by
            ``self.llm.generate`` for the assembled prompt.
        """
        query = f"""
        A journalist wants to get a report that answers this question: "{question}"
        THIS IS *NOT* A QUESTION YOU CAN ANSWER! Instead, you need to make a plan for how to answer this question.
        Include what type of information you need from what available sources.
        Available sources are:
        - Scientific articles
        - Other articles the journalists has gathered, such as blog posts, news articles, etc.
        - The journalists own notes.
        - Transcribed interviews (already done, you can't produce new ones).
        All of the above sources are available in a database, but you need to specify what you need. Be as precise as possible.
        As you don't have access to the sources, include steps to retrieve excerpts from articles and retrieve those that might be interesting.
        Also include steps to verify the information.
        Make the plan easy to follow and structured.
        Remember: You are not answering the question, you are making *a plan* for how to answer the question using the available sources.
        """
        # Only mention the notes when a project with a non-empty summary was
        # supplied; otherwise the prompt would contain "None" or the call
        # would fail outright.
        notes_summary = getattr(self._plan_project, "notes_summary", None)
        if notes_summary:
            # Trailing newline keeps the summary from running straight into
            # the formatting instructions appended below.
            query += f"\nTo help you understand the subject, here is a summary of notes the journalist has done: {notes_summary}\n"
        query += """Please structure the plan like:
        ## Step 1:
        - Task1: Description of task
        - Task2: Description of task
        ## Step 2:
        - Task1: Description of task
        - Task2: Description of task
        Etc, with as many steps and tasks as needed.
        """
        return self.llm.generate(query).content


class ResearchAssistant(ResearchBase):
    """Research bot configured with a generic "Research Assistant" persona."""

    def __init__(self, username):
        """Set the assistant's system message and start the bot.

        Args:
            username: Forwarded to ``ResearchBase.__init__``.
        """
        super().__init__(username=username)
        self.llm.system_message = "You are a Research Assistant"
        self.start()


class HelperBot(ResearchBase):
    """Bot that converts a free-text plan into validated structured data."""

    def __init__(self, username):
        """Configure the helper's persona, select the "small" model and start.

        Args:
            username: Forwarded to ``ResearchBase.__init__``.
        """
        super().__init__(username)
        self.llm.system_message = "You are helping a researcher to structure a text. You will get a text and make it into structured data. Make sure not to change the meaning of the text and keeps all the details in the subtasks."
        self.llm.model = "small"
        self.start()

    def make_structured_plan(self, text, question=None):
        """Turn a plan written as prose/markdown into a ``Plan`` model.

        The LLM is asked to answer according to the JSON schema of the local
        ``Plan`` model, and its answer is validated against that model.

        Args:
            text: The proposed plan as free text.
            question: Optional question the report should answer; when given
                it is quoted in the prompt.

        Returns:
            A ``Plan`` instance whose ``steps`` maps a step title to a list of
            ``(task name, task description)`` tuples.

        Raises:
            pydantic.ValidationError: If the LLM response content does not
                validate against ``Plan``.
        """

        class Plan(BaseModel):
            steps: Dict[str, List[Tuple[str, str]]] = Field(
                description="Structured plan represented as steps with their corresponding tasks or facts",
                # Passing ``example=`` directly to Field is a deprecated extra
                # keyword in pydantic v2; json_schema_extra puts the same
                # "example" key into the generated JSON schema.
                json_schema_extra={
                    "example": {
                        "Step 1: Gather Existing Materials": [
                            ("Task 1", "Description of task"),
                            ("Task 2", "Description of task"),
                        ],
                        "Step 2: Extract Relevant Information": [
                            ("Task 1", "Description of task"),
                            ("Task 2", "Description of task"),
                        ],
                    },
                },
            )

        # The two prompt variants differ only in whether the question is named.
        topic = f' on "{question}"' if question else ""
        query = f''' This is a proposed plan for how to write a report{topic}:\n"""{text}"""\nPlease make the plan into structured data with subtasks. Make sure to keep all the details in the subtasks.'''

        response = self.llm.generate(query, format=Plan.model_json_schema())
        print(response)
        structured_response = Plan.model_validate_json(response.content)
        print('PLAN')
        print_rainbow(structured_response)
        print()
        return structured_response


class ToolBot(ResearchBase):
    """Bot that proposes which of the available tools fit a given task."""

    def __init__(self, username, tools: list):
        """Create the bot and an LLM whose system message lists the tools.

        Args:
            username: Forwarded to ``ResearchBase.__init__``.
            tools: Tools to offer, forwarded to the parent initialiser as the
                ``tools`` keyword. Entries may be callables (their
                ``__name__`` is listed) or plain name strings.
        """
        super().__init__(username, tools=tools)
        self.start()
        # NOTE(review): ``self.tools`` is not assigned in this class; it is
        # presumably set by ``Bot.__init__`` from the ``tools`` keyword.
        # Accept both callables and plain strings: a str has no ``__name__``,
        # so fall back to the value itself.
        tools_names = [
            getattr(tool, "__name__", None) or str(tool) for tool in self.tools
        ]
        # Prefix every entry (including the first) with the same bullet.
        tools_name_string = "\n".join(f"– {name}" for name in tools_names)
        # NOTE(review): this replaces the LLM that start() used above, so
        # ``self.messages`` still holds the previous LLM's system message.
        self.llm = LLM(
            temperature=0,
            system_message=f"""
            You are an helpful assistant with tools. The tools you can choose from are:
            {tools_name_string}
            Your task is to choose one or multiple tools to answering a user's query.
            DON'T come up with your own tools, only use the ones provided.
            """,
            chat=False,
            model="tools",
        )

    def propose_tools(self, task):
        """Ask the LLM which tool(s) to use for ``task``.

        Args:
            task: Description of the task to find tools for.

        Returns:
            The ``tool_calls`` attribute of the LLM response.
        """
        # The prompt comes from get_tools_prompt(); an earlier inline prompt
        # was overwritten before use and has been removed.
        query = get_tools_prompt(task)
        response = self.llm.generate(query)
        print_yellow('Model:', self.llm.model)
        print_rainbow(response)
        return response.tool_calls

if __name__ == "__main__":
    # Manual end-to-end run: draft a plan, structure it, then ask for tool
    # suggestions for every task in the structured plan.

    base = StreamlitBaseClass(username="lasse")
    project = Project(
        username="lasse",
        project_name="Monarch butterflies",
        user_arango=base.get_arango(),
    )
    rm = ResearchManager(username="lasse", project=project)
    # NOTE(review): tools are given as name strings here; verify that
    # ToolBot/Bot accept strings rather than callables.
    tb = ToolBot(
        username="lasse",
        tools=[
            "fetch_science_articles_tool",
            "fetch_notes_tool",
            "fetch_other_documents_tool",
            "fetch_science_articles_and_other_documents_tool",
        ]
    )
    # ra = ResearchAssistant(username="lasse")
    hb = HelperBot(username="lasse")

    question = "Tell me five interesting facts about the Monarch butterfly"

    # Generate plan
    plan = rm.generate_plan(question)
    # -- Example of what a plan can look like --
    # plan = """## Step-by-Step Plan for Answering the Question: "Tell Me Five Interesting Facts About the Monarch Butterfly"

    # ### Step 1: Gather and Organize Existing Materials
    # - **Task 1:** Retrieve all existing materials related to Monarch butterflies from the database using keywords such as "Monarch butterfly migration," "habitat loss," "milkweed," "insecticides," "climate change," "Monarch Butterfly Biosphere Reserve," and "migration patterns."
    # - **Task 2:** Categorize these materials into scientific articles, other articles (blogs, news), own notes, and transcribed interviews for easy access.

    # ### Step 2: Extract Relevant Excerpts
    # - **Task 1:** From the retrieved scientific articles, extract information on migration patterns, genetic studies, and population decline factors.
    # - **Task 2:** From blogs and news articles, look for interesting anecdotes or recent findings about conservation efforts and unique behaviors of Monarch butterflies.

    # ### Step 3: Identify Potential Interesting Facts
    # - **Task 1:** Review the extracted excerpts to identify potential facts such as migration patterns, threats faced by Monarchs, population decline statistics, conservation efforts, and unique behaviors.
    # - **Task 2:** Compile a list of five compelling and accurate facts based on the extracted information.

    # ### Step 4: Verify Information
    # - **Task 1:** Cross-check each fact with multiple sources to ensure accuracy. For example, verify migration details across scientific articles and recent news reports.
    # - **Task 2:** Look for consensus among sources regarding population trends and threats to Monarchs.

    # ### Step 5: Structure the Report
    # - **Task 1:** Organize the five selected facts into a coherent structure, ensuring each fact is clearly explained and engaging.
    # - **Task 2:** Incorporate quotes or statistics from sources to add depth and credibility to each fact.

    # ### Step 6: Review and Finalize
    # - **Task 1:** Proofread the report for clarity, accuracy, and grammar.
    # - **Task 2:** Ensure all information is presented in an engaging manner suitable for a journalistic report.

    # This plan ensures that the journalist systematically gathers, verifies, and presents five interesting facts about Monarch butterflies, providing a comprehensive and accurate report.
    #     """
    #print_blue(plan)

    # Keep only what follows the first "</think>" marker (presumably the end
    # of the model's reasoning section). maxsplit=1 ensures nothing after a
    # later occurrence of the marker is dropped.
    if "</think>" in plan:
        plan = plan.split("</think>", 1)[1]

    # Make structured plan
    structured_plan = hb.make_structured_plan(plan, question)

    for step, tasks in structured_plan.steps.items():
        print_blue("\n### Step:", step)
        for task_name, task_description in tasks:
            print_blue("Task:", task_name)
            print_yellow(task_description)

            tools = tb.propose_tools(task_description)
            print_green("Tools:", tools)
            print('\n')