import os import base64 import re import json from typing import Any, Callable, Iterator, Literal, Mapping, Optional, Sequence, Union import tiktoken from ollama import Client, AsyncClient, ResponseError, ChatResponse, Message, Tool, Options from ollama._types import JsonSchemaValue, ChatRequest import env_manager from colorprinter.print_color import * env_manager.set_env() tokenizer = tiktoken.get_encoding("cl100k_base") # Define a base class for common functionality class BaseClient: def chat( self, model: str = '', messages: Optional[Sequence[Union[Mapping[str, Any], Message]]] = None, *, tools: Optional[Sequence[Union[Mapping[str, Any], Tool, Callable]]] = None, stream: bool = False, format: Optional[Union[Literal['', 'json'], JsonSchemaValue]] = None, options: Optional[Union[Mapping[str, Any], Options]] = None, keep_alive: Optional[Union[float, str]] = None, ) -> Union[ChatResponse, Iterator[ChatResponse]]: return self._request( ChatResponse, 'POST', '/api/chat', json=ChatRequest( model=model, messages=[message for message in messages or []], tools=[tool for tool in tools or []], stream=stream, format=format, options=options, keep_alive=keep_alive, ).model_dump(exclude_none=True), stream=stream, ) # Define your custom MyAsyncClient class class MyAsyncClient(AsyncClient, BaseClient): async def _request(self, response_type, method, path, headers=None, **kwargs): # Merge default headers with per-call headers all_headers = {**self._client.headers, **(headers or {})} # Handle streaming separately if kwargs.get('stream'): kwargs.pop('stream') async with self._client.stream(method, path, headers=all_headers, **kwargs) as response: self.last_response = response # Store the response object if response.status_code >= 400: await response.aread() raise ResponseError(response.text, response.status_code) return self._stream(response_type, response) else: # Make the HTTP request with the combined headers kwargs.pop('stream') response = await self._request_raw(method, path, headers=all_headers, **kwargs) self.last_response = response # Store the response object if response.status_code >= 400: raise ResponseError(response.text, response.status_code) return response_type.model_validate_json(response.content) async def chat( self, model: str = '', messages: Optional[Sequence[Union[Mapping[str, Any], Message]]] = None, *, tools: Optional[Sequence[Union[Mapping[str, Any], Tool, Callable]]] = None, stream: bool = False, format: Optional[Union[Literal['', 'json'], JsonSchemaValue]] = None, options: Optional[Union[Mapping[str, Any], Options]] = None, keep_alive: Optional[Union[float, str]] = None, ) -> Union[ChatResponse, Iterator[ChatResponse]]: return await self._request( ChatResponse, 'POST', '/api/chat', json=ChatRequest( model=model, messages=[message for message in messages or []], tools=[tool for tool in tools or []], stream=stream, format=format, options=options, keep_alive=keep_alive, ).model_dump(exclude_none=True), stream=stream, ) # Define your custom MyClient class class MyClient(Client, BaseClient): def _request(self, response_type, method, path, headers=None, **kwargs): # Merge default headers with per-call headers all_headers = {**self._client.headers, **(headers or {})} # Handle streaming separately if kwargs.get('stream'): kwargs.pop('stream') with self._client.stream(method, path, headers=all_headers, **kwargs) as response: self.last_response = response # Store the response object if response.status_code >= 400: raise ResponseError(response.text, response.status_code) return self._stream(response_type, response) else: # Make the HTTP request with the combined headers kwargs.pop('stream') response = self._request_raw(method, path, headers=all_headers, **kwargs) self.last_response = response # Store the response object if response.status_code >= 400: raise ResponseError(response.text, response.status_code) return response_type.model_validate_json(response.content) class LLM: """ LLM class for interacting with a language model. """ def __init__( self, system_message="You are an assistant.", temperature=0.01, model: Optional[Literal["small", "standard", "vision"]] = "standard", max_length_answer=4096, messages=None, chat=True, chosen_backend=None, ) -> None: self.model = self.get_model(model) self.system_message = system_message self.options = {"temperature": temperature} self.messages = messages or [{"role": "system", "content": self.system_message}] self.max_length_answer = max_length_answer self.chat = chat self.chosen_backend = chosen_backend # Initialize the client with the host and default headers credentials = f"{os.getenv('LLM_API_USER')}:{os.getenv('LLM_API_PWD_LASSE')}" encoded_credentials = base64.b64encode(credentials.encode()).decode() default_headers = { "Authorization": f"Basic {encoded_credentials}", } host_url = os.getenv("LLM_API_URL").rstrip("/api/chat/") self.client = MyClient(host=host_url, headers=default_headers) self.async_client = MyAsyncClient(host=host_url, headers=default_headers) def get_model(self, model_alias): models = { "standard": "LLM_MODEL", "small": "LLM_MODEL_SMALL", "vision": "LLM_MODEL_VISION", "standard_64k": "LLM_MODEL_64K", } return os.getenv(models.get(model_alias, "LLM_MODEL")) def count_tokens(self): num_tokens = 0 for i in self.messages: for k, v in i.items(): if k == "content": if not isinstance(v, str): v = str(v) tokens = tokenizer.encode(v) num_tokens += len(tokens) return int(num_tokens) def generate( self, query: str = None, user_input: str = None, context: str = None, stream: bool = False, tools: list = None, function_call: dict = None, images: list = None, model: Optional[Literal["small", "standard", "vision"]] = None, temperature: float = None, ): """ Generates a response from the language model based on the provided inputs. """ # Prepare the model and temperature model = self.get_model(model) if model else self.model temperature = temperature if temperature else self.options["temperature"] # Normalize whitespace and add the query to the messages query = re.sub(r"\s*\n\s*", "\n", query) message = {"role": "user", "content": query} # Handle images if any if images: import base64 base64_images = [] base64_pattern = re.compile(r"^[A-Za-z0-9+/]+={0,2}$") for image in images: if isinstance(image, str): if base64_pattern.match(image): base64_images.append(image) else: with open(image, "rb") as image_file: base64_images.append( base64.b64encode(image_file.read()).decode("utf-8") ) elif isinstance(image, bytes): base64_images.append(base64.b64encode(image).decode("utf-8")) else: print_red("Invalid image type") message["images"] = base64_images # Use the vision model model = self.get_model("vision") self.messages.append(message) # Prepare headers headers = {} if self.chosen_backend: headers["X-Chosen-Backend"] = self.chosen_backend if model == self.get_model("small"): headers["X-Model-Type"] = "small" # Prepare options options = Options(**self.options) options.temperature = temperature # Prepare tools if any if tools: tools = [ Tool(**tool) if isinstance(tool, dict) else tool for tool in tools ] # Adjust the options for long messages if self.chat or len(self.messages) > 15000: num_tokens = self.count_tokens() + self.max_length_answer // 2 if num_tokens > 8000: model = self.get_model("standard_64k") headers["X-Model-Type"] = "large" # Call the client.chat method try: response = self.client.chat( model=model, messages=self.messages, headers=headers, tools=tools, stream=stream, options=options, keep_alive=3600 * 24 * 7, ) except ResponseError as e: print_red("Error!") print(e) return "An error occurred." # If user_input is provided, update the last message if user_input: if context: if len(context) > 2000: context = self.make_summary(context) user_input = ( f"{user_input}\n\nUse the information below to answer the question.\n" f'"""{context}"""\n[This is a summary of the context provided in the original message.]' ) system_message_info = "\nSometimes some of the messages in the chat history are summarised, then that is clearly indicated in the message." if system_message_info not in self.messages[0]["content"]: self.messages[0]["content"] += system_message_info self.messages[-1] = {"role": "user", "content": user_input} self.chosen_backend = self.client.last_response.headers.get("X-Chosen-Backend") # Handle streaming response if stream: return self.read_stream(response) else: # Process the response if isinstance(response, ChatResponse): result = response.message.content.strip('"') self.messages.append({"role": "assistant", "content": result.strip('"')}) if tools and not response.message.get("tool_calls"): print_yellow("No tool calls in response".upper()) if not self.chat: self.messages = [self.messages[0]] return result else: print_red("Unexpected response type") return "An error occurred." def make_summary(self, text): # Implement your summary logic using self.client.chat() summary_message = { "role": "user", "content": f'Summarize the text below:\n"""{text}"""\nRemember to be concise and detailed. Answer in English.', } messages = [ {"role": "system", "content": "You are summarizing a text. Make it detailed and concise. Answer ONLY with the summary. Don't add any new information."}, summary_message, ] try: response = self.client.chat( model=self.get_model("small"), messages=messages, options=Options(temperature=0.01), keep_alive=3600 * 24 * 7, ) summary = response.message.content.strip() print_blue("Summary:", summary) return summary except ResponseError as e: print_red("Error generating summary:", e) return "Summary generation failed." def read_stream(self, response): # Implement streaming response handling if needed buffer = "" message = "" first_chunk = True prev_content = None for chunk in response: if chunk: content = chunk.message.content if first_chunk and content.startswith('"'): content = content[1:] first_chunk = False if chunk.done: if prev_content and prev_content.endswith('"'): prev_content = prev_content[:-1] if prev_content: yield prev_content break else: if prev_content: yield prev_content prev_content = content self.messages.append({"role": "assistant", "content": message.strip('"')}) async def async_generate( self, query: str = None, user_input: str = None, context: str = None, stream: bool = False, tools: list = None, function_call: dict = None, images: list = None, model: Optional[Literal["small", "standard", "vision"]] = None, temperature: float = None, ): """ Asynchronous method to generate a response from the language model. """ # Prepare the model and temperature model = self.get_model(model) if model else self.model temperature = temperature if temperature else self.options["temperature"] # Normalize whitespace and add the query to the messages query = re.sub(r"\s*\n\s*", "\n", query) message = {"role": "user", "content": query} # Handle images if any if images: # (Image handling code as in the generate method) ... self.messages.append(message) # Prepare headers headers = {} if self.chosen_backend: headers["X-Chosen-Backend"] = self.chosen_backend if model == self.get_model("small"): headers["X-Model-Type"] = "small" # Prepare options options = Options(**self.options) options.temperature = temperature # Prepare tools if any if tools: tools = [ Tool(**tool) if isinstance(tool, dict) else tool for tool in tools ] # Adjust options for long messages # (Adjustments as needed) ... # Call the async client's chat method try: response = await self.async_client.chat( model=model, messages=self.messages, tools=tools, stream=stream, options=options, keep_alive=3600 * 24 * 7, ) except ResponseError as e: print_red("Error!") print(e) return "An error occurred." # Process the response if isinstance(response, ChatResponse): result = response.message.content.strip('"') self.messages.append({"role": "assistant", "content": result.strip('"')}) return result else: print_red("Unexpected response type") return "An error occurred." # Usage example if __name__ == "__main__": import asyncio llm = LLM() async def main(): result = await llm.async_generate(query="Hello, how are you?") print(result) asyncio.run(main())