import os
import base64
import re
from typing import Literal, Optional

import requests
import tiktoken
from ollama import (
    Client,
    AsyncClient,
    ResponseError,
    ChatResponse,
    Tool,
    Options,
)

import env_manager
from colorprinter.print_color import *

env_manager.set_env()

tokenizer = tiktoken.get_encoding("cl100k_base")


class LLM:
    """
    LLM class for interacting with an instance of Ollama.

    Attributes:
        model (str): The model to be used for response generation.
        system_message (str): The system message to be used in the chat.
        options (dict): Options for the model, such as temperature.
        messages (list): List of messages in the chat.
        max_length_answer (int): Maximum length of the generated answer.
        chat (bool): Whether the chat mode is enabled.
        chosen_backend (str): The chosen backend server for the API.
        client (Client): The client for synchronous API calls.
        async_client (AsyncClient): The client for asynchronous API calls.
        tools (list): List of tools to be used in generating the response.

    Methods:
        __init__(self, system_message, temperature, model, max_length_answer, messages, chat, chosen_backend):
            Initializes the LLM class with the provided parameters.
        get_model(self, model_alias):
            Retrieves the model name based on the provided alias.
        count_tokens(self):
            Counts the number of tokens in the messages.
        get_least_conn_server(self):
            Retrieves the least connected server from the backend.
        generate(self, query, user_input, context, stream, tools, images, model, temperature):
            Generates a response based on the provided query and options.
        make_summary(self, text):
            Generates a summary of the provided text.
        read_stream(self, response):
            Handles streaming responses.
        async_generate(self, query, user_input, context, stream, tools, images, model, temperature):
            Asynchronously generates a response based on the provided query and options.
        prepare_images(self, images, message):
            Prepares images as base64-encoded strings and attaches them to a message.
    """

    def __init__(
        self,
        system_message: str = "You are an assistant.",
        temperature: float = 0.01,
        model: Optional[
            Literal["small", "standard", "vision", "reasoning", "tools"]
        ] = "standard",
        max_length_answer: int = 4096,
        messages: list[dict] = None,
        chat: bool = True,
        chosen_backend: str = None,
        tools: list = None,
    ) -> None:
        """
        Initialize the assistant with the given parameters.

        Args:
            system_message (str): The initial system message for the assistant. Defaults to "You are an assistant.".
            temperature (float): The temperature setting for the model, affecting randomness. Defaults to 0.01.
            model (Optional[Literal["small", "standard", "vision", "reasoning", "tools"]]): The model type to use. Defaults to "standard".
            max_length_answer (int): The maximum length of the generated answer. Defaults to 4096.
            messages (list[dict], optional): A list of initial messages. Defaults to None.
            chat (bool): Whether the assistant is in chat mode. Defaults to True.
            chosen_backend (str, optional): The backend server to use. If not provided, the least connected server is chosen.
            tools (list, optional): Tools that may be used when generating responses. Defaults to None.
        Returns:
            None
        """
        self.model = self.get_model(model)
        # Updated per call to record which model was actually used.
        self.call_model = self.model
        self.system_message = system_message
        self.options = {"temperature": temperature}
        self.messages = messages or [{"role": "system", "content": self.system_message}]
        self.max_length_answer = max_length_answer
        self.chat = chat
        if not chosen_backend:
            chosen_backend = self.get_least_conn_server()
        self.chosen_backend = chosen_backend
        headers = {
            "Authorization": f"Basic {self.get_credentials()}",
            "X-Chosen-Backend": self.chosen_backend,
        }
        # Derive the host URL by removing the API path suffix
        # (str.rstrip strips characters, not a suffix).
        self.host_url = os.getenv("LLM_API_URL").removesuffix("/api/chat/").removesuffix("/api/chat")
        self.host_url = 'http://192.168.1.12:3300'  #! Change back when possible
        self.client: Client = Client(host=self.host_url, headers=headers, timeout=120)
        self.async_client: AsyncClient = AsyncClient()

    def get_credentials(self):
        # Build the Basic Auth value used in the clients' default headers.
        credentials = f"{os.getenv('LLM_API_USER')}:{os.getenv('LLM_API_PWD_LASSE')}"
        return base64.b64encode(credentials.encode()).decode()

    def get_model(self, model_alias):
        # Resolve a model alias to the model name configured in the environment.
        # NOTE: also updates self.model as a side effect.
        models = {
            "standard": "LLM_MODEL",
            "small": "LLM_MODEL_SMALL",
            "vision": "LLM_MODEL_VISION",
            "standard_64k": "LLM_MODEL_LARGE",
            "reasoning": "LLM_MODEL_REASONING",
            "tools": "LLM_MODEL_TOOLS",
        }
        model = os.getenv(models.get(model_alias, "LLM_MODEL"))
        self.model = model
        return model

    def count_tokens(self):
        num_tokens = 0
        for i in self.messages:
            for k, v in i.items():
                if k == "content":
                    if not isinstance(v, str):
                        v = str(v)
                    tokens = tokenizer.encode(v)
                    num_tokens += len(tokens)
        return int(num_tokens)

    def get_least_conn_server(self):
        try:
            response = requests.get("http://192.168.1.12:5000/least_conn")
            response.raise_for_status()
            # Extract the least connected server from the response
            least_conn_server = response.headers.get("X-Upstream-Address")
            return least_conn_server
        except requests.RequestException as e:
            print_red("Error getting least connected server:", e)
            return None

    def generate(
        self,
        query: str = None,
        user_input: str = None,
        context: str = None,
        stream: bool = False,
        tools: list = None,
        images: list = None,
        model: Optional[
            Literal["small", "standard", "vision", "reasoning", "tools"]
        ] = None,
        temperature: float = None,
        messages: list[dict] = None,
        format=None,
    ):
        """
        Generate a response based on the provided query and context.

        Parameters:
            query (str): The query string from the user.
            user_input (str): Additional user input to be appended to the last message.
            context (str): Contextual information to be used in generating the response.
            stream (bool): Whether to stream the response.
            tools (list): List of tools to be used in generating the response.
            images (list): List of images to be included in the response.
            model (Optional[Literal["small", "standard", "vision", "reasoning", "tools"]]): The model type to be used.
            temperature (float): The temperature setting for the model.
            messages (list[dict]): List of previous messages in the conversation.
            format (Optional[BaseModel]): The format of the response.

        Returns:
            str: The generated response or an error message if an exception occurs.
""" print_yellow("GENERATE") # Prepare the model and temperature model = self.get_model(model) if model else self.model if model == self.get_model('tools'): stream = False temperature = temperature if temperature else self.options["temperature"] if messages: messages = [ {"role": i["role"], "content": re.sub(r"\s*\n\s*", "\n", i["content"])} for i in messages ] message = messages.pop(-1) query = message["content"] self.messages = messages else: # Normalize whitespace and add the query to the messages query = re.sub(r"\s*\n\s*", "\n", query) message = {"role": "user", "content": query} # Handle images if any if images: message = self.prepare_images(images, message) model = self.get_model("vision") self.messages.append(message) # Prepare headers headers = {"Authorization": f"Basic {self.get_credentials()}"} if self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]: #TODO Maybe reasoning shouldn't be here. headers["X-Chosen-Backend"] = self.chosen_backend if model == self.get_model("small"): headers["X-Model-Type"] = "small" if model == self.get_model("tools"): headers["X-Model-Type"] = "tools" elif model == self.get_model("reasoning"): headers["X-Model-Type"] = "reasoning" # Prepare options options = Options(**self.options) options.temperature = temperature #TODO This is a bit of a hack to get the reasoning model to work. It should be handled better. # # Adjust the options for long messages # if self.chat or len(self.messages) > 15000 and model != self.get_model("tools"): # num_tokens = self.count_tokens() # if num_tokens > 8000: # model = self.get_model("standard_64k") # print_purple("Switching to large model") # headers["X-Model-Type"] = "large" # Call the client.chat method try: self.call_model = model self.client: Client = Client(host=self.host_url, headers=headers, timeout=300) #! #print_rainbow(self.client._client.__dict__) print_yellow("Model used in call:", model) # if headers: # self.client.headers.update(headers) response = self.client.chat( model=model, messages=self.messages, tools=tools, stream=stream, options=options, keep_alive=3600 * 24 * 7, format=format ) except ResponseError as e: print_red("Error!") print(e) return "An error occurred." # print_rainbow(response.__dict__) # If user_input is provided, update the last message if user_input: if context: if len(context) > 2000: context = self.make_summary(context) user_input = ( f"{user_input}\n\nUse the information below to answer the question.\n" f'"""{context}"""\n[This is a summary of the context provided in the original message.]' ) system_message_info = "\nSometimes some of the messages in the chat history are summarised, then that is clearly indicated in the message." 
                if system_message_info not in self.messages[0]["content"]:
                    self.messages[0]["content"] += system_message_info
            self.messages[-1] = {"role": "user", "content": user_input}

        # self.chosen_backend = self.client.last_response.headers.get("X-Chosen-Backend")

        # Handle streaming response
        if stream:
            return self.read_stream(response)
        else:
            # Process the response
            if isinstance(response, ChatResponse):
                result = response.message.content.strip('"')
                # Drop any reasoning-model thinking block, keeping only the final answer.
                if "</think>" in result:
                    result = result.split("</think>")[-1]
                self.messages.append(
                    {"role": "assistant", "content": result.strip('"')}
                )
                if tools and not response.message.get("tool_calls"):
                    print_yellow("No tool calls in response".upper())
                if not self.chat:
                    self.messages = [self.messages[0]]
                return response.message
            else:
                print_red("Unexpected response type")
                return "An error occurred."

    def make_summary(self, text):
        # Summarize the text with the small model via self.client.chat().
        summary_message = {
            "role": "user",
            "content": f'Summarize the text below:\n"""{text}"""\nRemember to be concise and detailed. Answer in English.',
        }
        messages = [
            {
                "role": "system",
                "content": "You are summarizing a text. Make it detailed and concise. Answer ONLY with the summary. Don't add any new information.",
            },
            summary_message,
        ]
        try:
            response = self.client.chat(
                model=self.get_model("small"),
                messages=messages,
                options=Options(temperature=0.01),
                keep_alive=3600 * 24 * 7,
            )
            summary = response.message.content.strip()
            print_blue("Summary:", summary)
            return summary
        except ResponseError as e:
            print_red("Error generating summary:", e)
            return "Summary generation failed."

    def read_stream(self, response):
        """
        Yields (chunk_type, text) tuples. When the reasoning model is used, the first
        tuple is ('thinking', ...) and ends at the closing </think> tag; after that,
        ('normal', ...) tuples are yielded for the rest of the text.
        """
        thinking_buffer = ""
        in_thinking = self.call_model == self.get_model("reasoning")
        first_chunk = True
        prev_content = None
        for chunk in response:
            if not chunk:
                continue
            content = chunk.message.content
            # Remove leading quote if it's the first chunk
            if first_chunk and content.startswith('"'):
                content = content[1:]
            first_chunk = False
            if in_thinking:
                thinking_buffer += content
                if "</think>" in thinking_buffer:
                    end_idx = thinking_buffer.index("</think>") + len("</think>")
                    yield ("thinking", thinking_buffer[:end_idx])
                    remaining = thinking_buffer[end_idx:].strip('"')
                    if chunk.done and remaining:
                        yield ("normal", remaining)
                        break
                    else:
                        prev_content = remaining
                        in_thinking = False
            else:
                if prev_content:
                    yield ("normal", prev_content)
                prev_content = content
            if chunk.done:
                if prev_content and prev_content.endswith('"'):
                    prev_content = prev_content[:-1]
                if prev_content:
                    yield ("normal", prev_content)
                break
        # Placeholder entry; the streamed text is not accumulated into the history here.
        self.messages.append({"role": "assistant", "content": ""})

    async def async_generate(
        self,
        query: str = None,
        user_input: str = None,
        context: str = None,
        stream: bool = False,
        tools: list = None,
        images: list = None,
        model: Optional[Literal["small", "standard", "vision"]] = None,
        temperature: float = None,
    ):
        """
        Asynchronously generates a response based on the provided query and other parameters.

        Args:
            query (str, optional): The query string to generate a response for.
            user_input (str, optional): Additional user input to be included in the response.
            context (str, optional): Context information to be used in generating the response.
            stream (bool, optional): Whether to stream the response. Defaults to False.
            tools (list, optional): List of tools to be used in generating the response. Will set the model to 'tools'.
            images (list, optional): List of images to be included in the response.
            model (Optional[Literal["small", "standard", "vision", "tools"]], optional): The model to be used for generating the response.
            temperature (float, optional): The temperature setting for the model.

        Returns:
            str: The generated response or an error message if an exception occurs.

        Raises:
            ResponseError: If an error occurs during the response generation.

        Notes:
            - The function prepares the model and temperature settings.
            - It normalizes whitespace in the query and handles images if provided.
            - It prepares headers and options for the request.
            - It adjusts options for long messages and calls the async client's chat method.
            - If user_input is provided, it updates the last message.
            - It updates the chosen backend based on the response headers.
            - It handles streaming responses and processes the response accordingly.
            - It is not necessary to set model to 'tools' if you provide tools as an argument.
        """
        print_yellow("ASYNC GENERATE")

        # Normalize whitespace and add the query to the messages
        query = re.sub(r"\s*\n\s*", "\n", query)
        message = {"role": "user", "content": query}
        self.messages.append(message)

        # Prepare the model and temperature
        model = self.get_model(model) if model else self.model
        temperature = temperature if temperature else self.options["temperature"]

        # Prepare options
        options = Options(**self.options)
        options.temperature = temperature

        # Prepare headers
        headers = {}

        # Set model depending on the input
        if images:
            message = self.prepare_images(images, message)
            model = self.get_model("vision")
        elif tools:
            model = self.get_model("tools")
            headers["X-Model-Type"] = "tools"
            tools = [Tool(**tool) if isinstance(tool, dict) else tool for tool in tools]
        elif self.chosen_backend and model not in [
            self.get_model("vision"),
            self.get_model("tools"),
            self.get_model("reasoning"),
        ]:
            headers["X-Chosen-Backend"] = self.chosen_backend
        elif model == self.get_model("small"):
            headers["X-Model-Type"] = "small"

        # Adjust options for long messages
        if self.chat or len(self.messages) > 15000:
            num_tokens = self.count_tokens() + self.max_length_answer // 2
            if num_tokens > 8000 and model not in [
                self.get_model("vision"),
                self.get_model("tools"),
            ]:
                model = self.get_model("standard_64k")
                headers["X-Model-Type"] = "large"

        # Call the async client's chat method
        try:
            # Record the model actually used (read_stream() relies on this).
            self.call_model = model
            # ollama's AsyncClient.chat() takes no headers argument, so the headers
            # (including Basic auth, as in generate()) are passed to the client itself.
            headers["Authorization"] = f"Basic {self.get_credentials()}"
            self.async_client: AsyncClient = AsyncClient(
                host=self.host_url, headers=headers, timeout=300
            )
            response = await self.async_client.chat(
                model=model,
                messages=self.messages,
                tools=tools,
                stream=stream,
                options=options,
                keep_alive=3600 * 24 * 7,
            )
        except ResponseError as e:
            print_red("Error!")
            print(e)
            return "An error occurred."

        # If user_input is provided, update the last message
        if user_input:
            if context:
                if len(context) > 2000:
                    context = self.make_summary(context)
                user_input = (
                    f"{user_input}\n\nUse the information below to answer the question.\n"
                    f'"""{context}"""\n[This is a summary of the context provided in the original message.]'
                )
                system_message_info = "\nSometimes some of the messages in the chat history are summarised, then that is clearly indicated in the message."
                if system_message_info not in self.messages[0]["content"]:
                    self.messages[0]["content"] += system_message_info
            self.messages[-1] = {"role": "user", "content": user_input}

        # Update chosen_backend from the response headers if the client exposes them.
        # (The stock ollama AsyncClient has no last_response attribute, so this is guarded.)
        last_response = getattr(self.async_client, "last_response", None)
        if last_response is not None:
            print_red(last_response.headers.get("X-Chosen-Backend", "No backend"))
            if model not in [
                self.get_model("vision"),
                self.get_model("tools"),
                self.get_model("reasoning"),
            ]:
                self.chosen_backend = last_response.headers.get("X-Chosen-Backend")

        # Handle streaming response
        if stream:
            return self.read_stream(response)
        else:
            # Process the response
            if isinstance(response, ChatResponse):
                result = response.message.content.strip('"')
                self.messages.append(
                    {"role": "assistant", "content": result.strip('"')}
                )
                if tools and not response.message.get("tool_calls"):
                    print_yellow("No tool calls in response".upper())
                if not self.chat:
                    self.messages = [self.messages[0]]
                return result
            else:
                print_red("Unexpected response type")
                return "An error occurred."

    def prepare_images(self, images, message):
        """
        Prepares a list of images by converting them to base64 encoded strings and
        adds them to the provided message dictionary.

        Args:
            images (list): A list of images, where each image can be a file path (str),
                a base64 encoded string (str), or bytes.
            message (dict): A dictionary to which the base64 encoded images will be
                added under the key "images".

        Returns:
            dict: The updated message dictionary with the base64 encoded images added
                under the key "images".

        Note:
            Images that are neither str nor bytes are skipped and an error is printed.
        """
        base64_images = []
        base64_pattern = re.compile(r"^[A-Za-z0-9+/]+={0,2}$")
        for image in images:
            if isinstance(image, str):
                if base64_pattern.match(image):
                    # Already base64 encoded
                    base64_images.append(image)
                else:
                    # Treat the string as a file path
                    with open(image, "rb") as image_file:
                        base64_images.append(
                            base64.b64encode(image_file.read()).decode("utf-8")
                        )
            elif isinstance(image, bytes):
                base64_images.append(base64.b64encode(image).decode("utf-8"))
            else:
                print_red("Invalid image type")
        # Attach the images so the caller can switch to the vision model.
        message["images"] = base64_images
        return message


if __name__ == "__main__":
    llm = LLM()
    result = llm.generate(
        query="I want to add 2 and 2",
    )
    print(result.content)
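
    # Illustrative usage sketch for the streaming and async paths. It assumes a
    # reachable backend and is gated behind the hypothetical RUN_EXTRA_DEMOS
    # environment variable (not used elsewhere in this module), so the script's
    # default behaviour is unchanged.
    if os.getenv("RUN_EXTRA_DEMOS"):
        # Streaming: generate(stream=True) returns the read_stream() generator,
        # which yields (chunk_type, text) tuples.
        stream_llm = LLM(model="reasoning")
        for chunk_type, text in stream_llm.generate(
            query="Briefly explain how you would add 2 and 2.", stream=True
        ):
            if chunk_type == "thinking":
                print_purple(text)  # reasoning output up to the closing </think> tag
            else:
                print(text, end="", flush=True)
        print()

        # Async: async_generate() mirrors generate() but goes through the AsyncClient.
        import asyncio

        async_llm = LLM(chat=False)
        answer = asyncio.run(async_llm.async_generate(query="What is 2 + 2?"))
        print(answer)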