import os
import base64
import re
from typing import Literal, Optional
import requests
import tiktoken
from ollama import (
Client,
AsyncClient,
ResponseError,
ChatResponse,
Tool,
Options,
)
import env_manager
from colorprinter.print_color import *
env_manager.set_env()
tokenizer = tiktoken.get_encoding("cl100k_base")
class LLM:
"""
LLM class for interacting with an instance of Ollama.
Attributes:
model (str): The model to be used for response generation.
system_message (str): The system message to be used in the chat.
options (dict): Options for the model, such as temperature.
messages (list): List of messages in the chat.
max_length_answer (int): Maximum length of the generated answer.
chat (bool): Whether the chat mode is enabled.
chosen_backend (str): The chosen backend server for the API.
client (Client): The client for synchronous API calls.
async_client (AsyncClient): The client for asynchronous API calls.
tools (list): List of tools to be used in generating the response.
Methods:
__init__(self, system_message, temperature, model, max_length_answer, messages, chat, chosen_backend):
Initializes the LLM class with the provided parameters.
get_model(self, model_alias):
Retrieves the model name based on the provided alias.
count_tokens(self):
Counts the number of tokens in the messages.
get_least_conn_server(self):
Retrieves the least connected server from the backend.
generate(self, query, user_input, context, stream, tools, images, model, temperature):
Generates a response based on the provided query and options.
make_summary(self, text):
Generates a summary of the provided text.
read_stream(self, response):
Handles streaming responses.
async_generate(self, query, user_input, context, stream, tools, images, model, temperature):
Asynchronously generates a response based on the provided query and options.
        prepare_images(self, images, message):
            Encodes the provided images as base64 strings and attaches them to the message.
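
    Example:
        Illustrative sketch; assumes the LLM_* environment variables point at a reachable
        Ollama proxy (env_manager.set_env() loads them at import time).

            llm = LLM(system_message="You are a helpful assistant.", model="small")
            reply = llm.generate(query="What is the capital of France?")
            print(reply.content)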
"""
def __init__(
self,
system_message: str = "You are an assistant.",
temperature: float = 0.01,
model: Optional[
Literal["small", "standard", "vision", "reasoning", "tools"]
] = "standard",
max_length_answer: int = 4096,
messages: list[dict] = None,
chat: bool = True,
chosen_backend: str = None,
tools: list = None,
) -> None:
"""
Initialize the assistant with the given parameters.
Args:
system_message (str): The initial system message for the assistant. Defaults to "You are an assistant.".
temperature (float): The temperature setting for the model, affecting randomness. Defaults to 0.01.
            model (Optional[Literal["small", "standard", "vision", "reasoning", "tools"]]): The model type to use. Defaults to "standard".
max_length_answer (int): The maximum length of the generated answer. Defaults to 4096.
messages (list[dict], optional): A list of initial messages. Defaults to None.
chat (bool): Whether the assistant is in chat mode. Defaults to True.
            chosen_backend (str, optional): The backend server to use. If not provided, the least connected server is chosen.
            tools (list, optional): Tools made available to the model when generating responses. Defaults to None.
Returns:
None
"""
self.model = self.get_model(model)
        self.call_model = self.model  # Records which model was actually used for the most recent call
self.system_message = system_message
self.options = {"temperature": temperature}
self.messages = messages or [{"role": "system", "content": self.system_message}]
self.max_length_answer = max_length_answer
self.chat = chat
if not chosen_backend:
chosen_backend = self.get_least_conn_server()
self.chosen_backend = chosen_backend
headers = {
"Authorization": f"Basic {self.get_credentials()}",
"X-Chosen-Backend": self.chosen_backend,
}
        # Note: str.rstrip() strips a set of characters, not a suffix, so remove the "/api/chat" path explicitly
        self.host_url = re.sub(r"/api/chat/?$", "", os.getenv("LLM_API_URL", ""))
self.host_url = 'http://192.168.1.12:3300' #! Change back when possible
self.client: Client = Client(host=self.host_url, headers=headers, timeout=120)
        self.async_client: AsyncClient = AsyncClient(host=self.host_url, headers=headers, timeout=120)
def get_credentials(self):
        # Build HTTP Basic Auth credentials from environment variables
credentials = f"{os.getenv('LLM_API_USER')}:{os.getenv('LLM_API_PWD_LASSE')}"
return base64.b64encode(credentials.encode()).decode()
def get_model(self, model_alias):
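        """
        Resolve a model alias (e.g. "small", "vision", "tools") to the concrete model name
        stored in the corresponding environment variable; unknown aliases fall back to LLM_MODEL.
        """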
models = {
"standard": "LLM_MODEL",
"small": "LLM_MODEL_SMALL",
"vision": "LLM_MODEL_VISION",
"standard_64k": "LLM_MODEL_LARGE",
"reasoning": "LLM_MODEL_REASONING",
"tools": "LLM_MODEL_TOOLS",
}
        # Deliberately avoid mutating self.model here: get_model() is also called for
        # per-call comparisons, and a side effect would leave self.model pointing at
        # whichever alias happened to be checked last.
        return os.getenv(models.get(model_alias, "LLM_MODEL"))
def count_tokens(self):
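        """
        Count tokens across all message contents using the cl100k_base tokenizer
        (an approximation, since the Ollama-served models use their own tokenizers).
        """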
num_tokens = 0
for i in self.messages:
for k, v in i.items():
if k == "content":
if not isinstance(v, str):
v = str(v)
tokens = tokenizer.encode(v)
num_tokens += len(tokens)
return int(num_tokens)
def get_least_conn_server(self):
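        """
        Ask the load balancer (hardcoded at 192.168.1.12:5000) which backend currently has
        the fewest connections; returns None if the request fails.
        """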
try:
response = requests.get("http://192.168.1.12:5000/least_conn")
response.raise_for_status()
# Extract the least connected server from the response
least_conn_server = response.headers.get("X-Upstream-Address")
return least_conn_server
except requests.RequestException as e:
print_red("Error getting least connected server:", e)
return None
def generate(
self,
query: str = None,
user_input: str = None,
context: str = None,
stream: bool = False,
tools: list = None,
images: list = None,
model: Optional[
Literal["small", "standard", "vision", "reasoning", "tools"]
] = None,
temperature: float = None,
messages: list[dict] = None,
        format=None,
):
"""
Generate a response based on the provided query and context.
Parameters:
query (str): The query string from the user.
user_input (str): Additional user input to be appended to the last message.
context (str): Contextual information to be used in generating the response.
stream (bool): Whether to stream the response.
tools (list): List of tools to be used in generating the response.
images (list): List of images to be included in the response.
            model (Optional[Literal["small", "standard", "vision", "reasoning", "tools"]]): The model type to be used.
temperature (float): The temperature setting for the model.
messages (list[dict]): List of previous messages in the conversation.
            format (optional): A JSON schema (e.g. from a pydantic model's .model_json_schema()) describing the desired structure of the response.
        Returns:
            The response message (or a generator of (chunk_type, text) tuples when streaming), or an error string if an exception occurs.
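        Example:
            Illustrative sketch; assumes a configured backend. The Answer model is a
            hypothetical pydantic class used to request structured output.

                from pydantic import BaseModel

                class Answer(BaseModel):
                    text: str

                llm = LLM(model="small")
                msg = llm.generate(
                    query="Reply with a short greeting as JSON.",
                    format=Answer.model_json_schema(),
                )
                print(msg.content)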
"""
print_yellow("GENERATE")
# Prepare the model and temperature
model = self.get_model(model) if model else self.model
if model == self.get_model('tools'):
stream = False
        temperature = temperature if temperature is not None else self.options["temperature"]
if messages:
messages = [
{"role": i["role"], "content": re.sub(r"\s*\n\s*", "\n", i["content"])}
for i in messages
]
message = messages.pop(-1)
query = message["content"]
self.messages = messages
else:
# Normalize whitespace and add the query to the messages
query = re.sub(r"\s*\n\s*", "\n", query)
message = {"role": "user", "content": query}
# Handle images if any
if images:
message = self.prepare_images(images, message)
model = self.get_model("vision")
self.messages.append(message)
# Prepare headers
headers = {"Authorization": f"Basic {self.get_credentials()}"}
if self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]: #TODO Maybe reasoning shouldn't be here.
headers["X-Chosen-Backend"] = self.chosen_backend
if model == self.get_model("small"):
headers["X-Model-Type"] = "small"
if model == self.get_model("tools"):
headers["X-Model-Type"] = "tools"
elif model == self.get_model("reasoning"):
headers["X-Model-Type"] = "reasoning"
# Prepare options
options = Options(**self.options)
options.temperature = temperature
#TODO This is a bit of a hack to get the reasoning model to work. It should be handled better.
# # Adjust the options for long messages
# if self.chat or len(self.messages) > 15000 and model != self.get_model("tools"):
# num_tokens = self.count_tokens()
# if num_tokens > 8000:
# model = self.get_model("standard_64k")
# print_purple("Switching to large model")
# headers["X-Model-Type"] = "large"
# Call the client.chat method
try:
self.call_model = model
self.client: Client = Client(host=self.host_url, headers=headers, timeout=300) #!
#print_rainbow(self.client._client.__dict__)
print_yellow("Model used in call:", model)
# if headers:
# self.client.headers.update(headers)
response = self.client.chat(
model=model,
messages=self.messages,
tools=tools,
stream=stream,
options=options,
keep_alive=3600 * 24 * 7,
format=format
)
except ResponseError as e:
print_red("Error!")
print(e)
return "An error occurred."
# print_rainbow(response.__dict__)
# If user_input is provided, update the last message
if user_input:
if context:
if len(context) > 2000:
context = self.make_summary(context)
user_input = (
f"{user_input}\n\nUse the information below to answer the question.\n"
f'"""{context}"""\n[This is a summary of the context provided in the original message.]'
)
system_message_info = "\nSometimes some of the messages in the chat history are summarised, then that is clearly indicated in the message."
if system_message_info not in self.messages[0]["content"]:
self.messages[0]["content"] += system_message_info
self.messages[-1] = {"role": "user", "content": user_input}
# self.chosen_backend = self.client.last_response.headers.get("X-Chosen-Backend")
# Handle streaming response
if stream:
return self.read_stream(response)
else:
# Process the response
if isinstance(response, ChatResponse):
result = response.message.content.strip('"')
if '</think>' in result:
result = result.split('</think>')[-1]
self.messages.append(
{"role": "assistant", "content": result.strip('"')}
)
if tools and not response.message.get("tool_calls"):
print_yellow("No tool calls in response".upper())
if not self.chat:
self.messages = [self.messages[0]]
return response.message
else:
print_red("Unexpected response type")
return "An error occurred."
def make_summary(self, text):
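        """
        Summarize a long text with the small model so it fits comfortably in the prompt.
        Returns the summary string, or an error message if the call fails.
        """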
        # Summarize long context with the small model so it stays compact in the prompt
summary_message = {
"role": "user",
"content": f'Summarize the text below:\n"""{text}"""\nRemember to be concise and detailed. Answer in English.',
}
messages = [
{
"role": "system",
"content": "You are summarizing a text. Make it detailed and concise. Answer ONLY with the summary. Don't add any new information.",
},
summary_message,
]
try:
response = self.client.chat(
model=self.get_model("small"),
messages=messages,
options=Options(temperature=0.01),
keep_alive=3600 * 24 * 7,
)
summary = response.message.content.strip()
print_blue("Summary:", summary)
return summary
except ResponseError as e:
print_red("Error generating summary:", e)
return "Summary generation failed."
def read_stream(self, response):
"""
        Yield tuples of (chunk_type, text) from a streaming response. For reasoning models,
        chunks are yielded as ('thinking', ...) up to and including the closing </think> tag;
        everything after that point is yielded as ('normal', ...).
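
        Example:
            Illustrative sketch; assumes generate() was called with stream=True.

                for chunk_type, text in llm.generate(query="Explain DNS briefly.", stream=True):
                    if chunk_type == "thinking":
                        continue  # skip the model's reasoning trace
                    print(text, end="", flush=True)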
"""
thinking_buffer = ""
in_thinking = self.call_model == self.get_model("reasoning")
first_chunk = True
prev_content = None
for chunk in response:
if not chunk:
continue
content = chunk.message.content
# Remove leading quote if it's the first chunk
if first_chunk and content.startswith('"'):
content = content[1:]
first_chunk = False
if in_thinking:
thinking_buffer += content
if "</think>" in thinking_buffer:
end_idx = thinking_buffer.index("</think>") + len("</think>")
yield ("thinking", thinking_buffer[:end_idx])
remaining = thinking_buffer[end_idx:].strip('"')
if chunk.done and remaining:
yield ("normal", remaining)
break
else:
prev_content = remaining
in_thinking = False
else:
if prev_content:
yield ("normal", prev_content)
prev_content = content
if chunk.done:
if prev_content and prev_content.endswith('"'):
prev_content = prev_content[:-1]
if prev_content:
yield ("normal", prev_content)
break
        # Append a placeholder assistant message; the streamed content is not accumulated here
        self.messages.append({"role": "assistant", "content": ""})
async def async_generate(
self,
query: str = None,
user_input: str = None,
context: str = None,
stream: bool = False,
tools: list = None,
images: list = None,
        model: Optional[Literal["small", "standard", "vision", "reasoning", "tools"]] = None,
temperature: float = None,
):
"""
Asynchronously generates a response based on the provided query and other parameters.
Args:
query (str, optional): The query string to generate a response for.
user_input (str, optional): Additional user input to be included in the response.
context (str, optional): Context information to be used in generating the response.
stream (bool, optional): Whether to stream the response. Defaults to False.
tools (list, optional): List of tools to be used in generating the response. Will set the model to 'tools'.
images (list, optional): List of images to be included in the response.
model (Optional[Literal["small", "standard", "vision", "tools"]], optional): The model to be used for generating the response.
temperature (float, optional): The temperature setting for the model.
Returns:
str: The generated response or an error message if an exception occurs.
Raises:
ResponseError: If an error occurs during the response generation.
Notes:
- The function prepares the model and temperature settings.
- It normalizes whitespace in the query and handles images if provided.
- It prepares headers and options for the request.
- It adjusts options for long messages and calls the async client's chat method.
- If user_input is provided, it updates the last message.
- It updates the chosen backend based on the response headers.
- It handles streaming responses and processes the response accordingly.
            - It's not necessary to set model to 'tools' if you provide tools as an argument.
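
        Example:
            Illustrative sketch; assumes it is awaited inside an asyncio event loop.

                import asyncio

                async def main():
                    llm = LLM(chat=False)
                    answer = await llm.async_generate(query="Name three prime numbers.")
                    print(answer)

                asyncio.run(main())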
"""
print_yellow("ASYNC GENERATE")
        # Normalize whitespace and add the query to the messages
query = re.sub(r"\s*\n\s*", "\n", query)
message = {"role": "user", "content": query}
self.messages.append(message)
# Prepare the model and temperature
model = self.get_model(model) if model else self.model
        temperature = temperature if temperature is not None else self.options["temperature"]
# Prepare options
options = Options(**self.options)
options.temperature = temperature
# Prepare headers
headers = {}
# Set model depending on the input
if images:
message = self.prepare_images(images, message)
model = self.get_model("vision")
elif tools:
model = self.get_model("tools")
headers["X-Model-Type"] = "tools"
tools = [Tool(**tool) if isinstance(tool, dict) else tool for tool in tools]
elif self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]:
headers["X-Chosen-Backend"] = self.chosen_backend
elif model == self.get_model("small"):
headers["X-Model-Type"] = "small"
# Adjust options for long messages
if self.chat or len(self.messages) > 15000:
num_tokens = self.count_tokens() + self.max_length_answer // 2
if num_tokens > 8000 and model not in [
self.get_model("vision"),
self.get_model("tools"),
]:
model = self.get_model("standard_64k")
headers["X-Model-Type"] = "large"
        # Call the async client's chat method
        try:
            # ollama's AsyncClient.chat() takes no per-call headers, so rebuild the client
            # with the auth and routing headers, mirroring the synchronous path
            headers["Authorization"] = f"Basic {self.get_credentials()}"
            self.async_client = AsyncClient(host=self.host_url, headers=headers, timeout=300)
            response = await self.async_client.chat(
                model=model,
                messages=self.messages,
                tools=tools,
                stream=stream,
                options=options,
                keep_alive=3600 * 24 * 7,
            )
except ResponseError as e:
print_red("Error!")
print(e)
return "An error occurred."
# If user_input is provided, update the last message
if user_input:
if context:
if len(context) > 2000:
context = self.make_summary(context)
user_input = (
f"{user_input}\n\nUse the information below to answer the question.\n"
f'"""{context}"""\n[This is a summary of the context provided in the original message.]'
)
system_message_info = "\nSometimes some of the messages in the chat history are summarised, then that is clearly indicated in the message."
if system_message_info not in self.messages[0]["content"]:
self.messages[0]["content"] += system_message_info
self.messages[-1] = {"role": "user", "content": user_input}
        # Update chosen_backend from the proxy's response header if the client exposes the
        # last HTTP response (the stock ollama AsyncClient does not keep it around)
        last_response = getattr(self.async_client, "last_response", None)
        if last_response is not None:
            print_red(last_response.headers.get("X-Chosen-Backend", "No backend"))
            if model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]:
                self.chosen_backend = last_response.headers.get("X-Chosen-Backend")
# Handle streaming response
if stream:
return self.read_stream(response)
else:
# Process the response
if isinstance(response, ChatResponse):
result = response.message.content.strip('"')
self.messages.append(
{"role": "assistant", "content": result.strip('"')}
)
if tools and not response.message.get("tool_calls"):
print_yellow("No tool calls in response".upper())
if not self.chat:
self.messages = [self.messages[0]]
return result
else:
print_red("Unexpected response type")
return "An error occurred."
def prepare_images(self, images, message):
"""
Prepares a list of images by converting them to base64 encoded strings and adds them to the provided message dictionary.
Args:
images (list): A list of images, where each image can be a file path (str), a base64 encoded string (str), or bytes.
message (dict): A dictionary to which the base64 encoded images will be added under the key "images".
Returns:
dict: The updated message dictionary with the base64 encoded images added under the key "images".
        Notes:
            Images that are neither str nor bytes are skipped with a warning rather than raising.
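
        Example:
            Illustrative sketch; the file path is hypothetical. generate() normally calls
            this for you when you pass images=[...].

                message = {"role": "user", "content": "Describe this picture."}
                message = llm.prepare_images(["/tmp/photo.jpg"], message)
                # message["images"] now holds base64-encoded image data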
"""
        base64_images = []
        # Heuristic: a plain single-line base64 string contains only these characters
        base64_pattern = re.compile(r"^[A-Za-z0-9+/]+={0,2}$")
        for image in images:
            if isinstance(image, str):
                # Prefer treating the string as a file path if it exists on disk;
                # otherwise assume it is already base64 encoded
                if os.path.isfile(image):
                    with open(image, "rb") as image_file:
                        base64_images.append(
                            base64.b64encode(image_file.read()).decode("utf-8")
                        )
                elif base64_pattern.match(image):
                    base64_images.append(image)
                else:
                    print_red("Invalid image string: not a file path or base64 data")
            elif isinstance(image, bytes):
                base64_images.append(base64.b64encode(image).decode("utf-8"))
            else:
                print_red("Invalid image type")
        message["images"] = base64_images
        return message
if __name__ == "__main__":
llm = LLM()
result = llm.generate(
query="I want to add 2 and 2",
)
print(result.content)