first commit

7 months ago · 6c807c8744
commit 6c807c8744
7 changed files with 729 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,36 @@
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Environment variables
+.env
+.venv
+.envrc
+
+# IDE specific files
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# OS specific files
+.DS_Store
+Thumbs.db
--- a/0
+++ b/0
--- a/README.md
+++ b/README.md
@ -0,0 +1,109 @@
+# _llm
+
+A Python package for interacting with large language models (LLMs) through Ollama, supporting both a remote Ollama API and a local Ollama instance.
+
+## Installation
+
+Install directly from GitHub:
+
+```bash
+pip install git+https://github.com/lasseedfast/_llm.git
+```
+
+Or clone and install for development:
+
+```bash
+git clone https://github.com/lasseedfast/_llm.git
+cd _llm
+pip install -e .
+```
+
+## Dependencies
+
+This package requires:
+
+- env_manager: `pip install git+https://github.com/lasseedfast/env_manager.git`
+- colorprinter: `pip install git+https://github.com/lasseedfast/colorprinter.git`
+- ollama: Python client used for both the remote API and local model inference
+- tiktoken: For token counting
+- requests: For API communication
+
+## Environment Variables
+
+The package requires several environment variables to be set:
+
+- `LLM_API_URL`: URL of the Ollama API
+- `LLM_API_USER`: Username for API authentication
+- `LLM_API_PWD_LASSE`: Password for API authentication
+- `LLM_MODEL`: Standard model name
+- `LLM_MODEL_SMALL`: Small model name
+- `LLM_MODEL_VISION`: Vision model name
+- `LLM_MODEL_LARGE`: Large context model name
+- `LLM_MODEL_REASONING`: Reasoning model name
+- `LLM_MODEL_TOOLS`: Tools model name
+
+These can be set in a `.env` file in your project directory or in the ArangoDB environment document in the div database.
+
+## Basic Usage
+
+```python
+from _llm import LLM
+
+# Initialize the LLM
+llm = LLM()
+
+# Generate a response
+result = llm.generate(
+    query="I want to add 2 and 2",
+)
+print(result.content)
+```
+
+## Advanced Usage
+
+### Working with Images
+
+```python
+from _llm import LLM
+
+llm = LLM()
+response = llm.generate(
+    query="What's in this image?",
+    images=["path/to/image.jpg"],
+    model="vision"
+)
+```
+
+### Streaming Responses
+
+```python
+from _llm import LLM
+
+llm = LLM()
+for chunk_type, chunk in llm.generate(
+    query="Write a paragraph about AI",
+    stream=True
+):
+    print(f"{chunk_type}: {chunk}")
+```
+
+### Using Async API
+
+```python
+import asyncio
+from _llm import LLM
+
+async def main():
+    llm = LLM()
+    response = await llm.async_generate(
+        query="What is machine learning?",
+        model="standard"
+    )
+    print(response)
+
+asyncio.run(main())
+```
+
+## License
+
+MIT
--- a/_llm/init.py
+++ b/_llm/init.py
@ -0,0 +1,4 @@
+from .llm import LLM, remove_thinking  # re-export the public names from the llm module
+
+__version__ = "0.1.0"  # package version string (setup.py declares the same value)
+__all__ = ["LLM", "remove_thinking"]  # names exported by `from _llm import *`
--- a/_llm/main.py
+++ b/_llm/main.py
--- a/_llm/llm.py
+++ b/_llm/llm.py
@ -0,0 +1,554 @@
+import os
+import base64
+import re
+import traceback
+from typing import Literal, Optional
+import requests
+import tiktoken
+from ollama import (
+    Client,
+    AsyncClient,
+    ResponseError,
+    ChatResponse,
+    Tool,
+    Options,
+)
+
+import env_manager
+from colorprinter.print_color import *
+
+env_manager.set_env()
+
+print(os.environ)
+tokenizer = tiktoken.get_encoding("cl100k_base")
+
+
+class LLM:
+    """
+    LLM class for interacting with an instance of Ollama.
+
+    Attributes:
+        model (str): The model to be used for response generation.
+        system_message (str): The system message to be used in the chat.
+        options (dict): Options for the model, such as temperature.
+        messages (list): List of messages in the chat.
+        max_length_answer (int): Maximum length of the generated answer.
+        chat (bool): Whether the chat mode is enabled.
+        chosen_backend (str): The chosen backend server for the API.
+        client (Client): The client for synchronous API calls.
+        async_client (AsyncClient): The client for asynchronous API calls.
+        tools (list): List of tools to be used in generating the response.
+
+    Note:
+        Most logic for message preparation, header building, and API calls is handled by private helper methods.
+        The main public methods are:
+            - generate: Synchronous generation (see method docstring)
+            - async_generate: Asynchronous generation (see method docstring)
+            - make_summary: Generate a summary of a text
+            - read_stream: Handle streaming responses
+            - prepare_images: Prepare images for vision models
+    """
+
+    def __init__(
+        self,
+        system_message: str = "You are an assistant.",
+        temperature: float = 0.01,
+        model: Optional[
+            Literal["small", "standard", "vision", "reasoning", "tools"]
+        ] = "standard",
+        max_length_answer: int = 4096,
+        messages: list[dict] = None,
+        chat: bool = True,
+        chosen_backend: str = None,
+        tools: list = None,
+    ) -> None:
+        """
+        Initialize the assistant with the given parameters.
+
+        Args:
+            system_message (str): The initial system message for the assistant. Defaults to "You are an assistant.".
+            temperature (float): The temperature setting for the model, affecting randomness. Defaults to 0.01.
+            model (Optional[Literal["small", "standard", "vision", "reasoning"]]): The model type to use. Defaults to "standard".
+            max_length_answer (int): The maximum length of the generated answer. Defaults to 4096.
+            messages (list[dict], optional): A list of initial messages. Defaults to None.
+            chat (bool): Whether the assistant is in chat mode. Defaults to True.
+            chosen_backend (str, optional): The backend server to use. If not provided, the least connected server is chosen.
+
+        Returns:
+            None
+        """
+
+        self.model = self.get_model(model)
+        self.call_model = (
+            self.model
+        )  # This is set per call to decide what model that was actually used
+        self.system_message = system_message
+        self.options = {"temperature": temperature}
+        self.messages = messages or [{"role": "system", "content": self.system_message}]
+        self.max_length_answer = max_length_answer
+        self.chat = chat
+
+        self.chosen_backend = chosen_backend
+
+
+        headers = {
+            "Authorization": f"Basic {self.get_credentials()}",
+        }
+
+        if self.chosen_backend:
+            headers["X-Chosen-Backend"] = self.chosen_backend
+            
+        self.host_url = os.getenv("LLM_API_URL").rstrip("/api/chat/")
+        self.client: Client = Client(host=self.host_url, headers=headers, timeout=120)           
+        self.async_client: AsyncClient = AsyncClient()
+
+    def get_credentials(self):
+        # Initialize the client with the host and default headers
+        credentials = f"{os.getenv('LLM_API_USER')}:{os.getenv('LLM_API_PWD_LASSE')}"
+        return base64.b64encode(credentials.encode()).decode()
+    
+    def get_model(self, model_alias):
+        models = {
+            "standard": "LLM_MODEL",
+            "small": "LLM_MODEL_SMALL",
+            "vision": "LLM_MODEL_VISION",
+            "standard_64k": "LLM_MODEL_LARGE",
+            "reasoning": "LLM_MODEL_REASONING",
+            "tools": "LLM_MODEL_TOOLS",
+        }
+        model = os.getenv(models.get(model_alias, "LLM_MODEL"))
+        print_purple(f"Using model: {model}")
+        return model
+
+    def count_tokens(self):
+        num_tokens = 0
+        for i in self.messages:
+            for k, v in i.items():
+                if k == "content":
+                    if not isinstance(v, str):
+                        v = str(v)
+                    tokens = tokenizer.encode(v)
+                    num_tokens += len(tokens)
+        return int(num_tokens)
+
+    def _prepare_messages_and_model(self, query, user_input, context, messages, images, model):
+        """Prepare messages and select the appropriate model, handling images if present."""
+        print_red('model', model)
+        if messages:
+            messages = [
+                {"role": i["role"], "content": re.sub(r"\s*\n\s*", "\n", i["content"])}
+                for i in messages
+            ]
+            message = messages.pop(-1)
+            query = message["content"]
+            self.messages = messages
+
+        else:
+            query = re.sub(r"\s*\n\s*", "\n", query)
+            message = {"role": "user", "content": query}
+        
+        if images:
+            message = self.prepare_images(images, message)
+            model = self.get_model("vision")
+        else:
+            model = self.get_model(model)
+            
+        self.messages.append(message)
+        print_red('return model',model)
+        return model
+
+    def _build_headers(self, model, tools, think):
+        """Build HTTP headers for API requests, including auth and backend/model info."""
+        headers = {"Authorization": f"Basic {self.get_credentials()}"}
+        if self.chosen_backend and model not in [self.get_model("vision"), self.get_model("tools"), self.get_model("reasoning")]:
+            headers["X-Chosen-Backend"] = self.chosen_backend
+        if model == self.get_model("small"):
+            headers["X-Model-Type"] = "small"
+        if model == self.get_model("tools"):
+            headers["X-Model-Type"] = "tools"
+        if think and model and any([m in model for m in ['qwen3', 'deepseek']]):
+            self.messages[-1]['content'] = f"/think\n{self.messages[-1]['content']}"
+        elif model and any([m in model for m in ['qwen3', 'deepseek']]):
+            self.messages[-1]['content'] = f"/no_think\n{self.messages[-1]['content']}"
+        return headers
+
+    def _get_options(self, temperature):
+        """Build model options, setting temperature and other parameters."""
+        options = Options(**self.options)
+        options.temperature = temperature if temperature is not None else self.options["temperature"]
+        return options
+
+    def _call_remote_api(self, model, tools, stream, options, format, headers):
+        """Call the remote Ollama API synchronously."""
+        self.call_model = model
+        self.client: Client = Client(host=self.host_url, headers=headers, timeout=300)
+        print_yellow(f"🤖 Generating using {model} (remote)...")
+        response = self.client.chat(
+            model=model,
+            messages=self.messages,
+            tools=tools,
+            stream=stream,
+            options=options,
+            keep_alive=3600 * 24 * 7,
+            format=format
+        )
+        return response
+
+    async def _call_remote_api_async(self, model, tools, stream, options, format, headers):
+        """Call the remote Ollama API asynchronously."""
+        print_yellow(f"🤖 Generating using {model} (remote, async)...")
+        response = await self.async_client.chat(
+            model=model,
+            messages=self.messages,
+            headers=headers,
+            tools=tools,
+            stream=stream,
+            options=options,
+            keep_alive=3600 * 24 * 7,
+        )
+        return response
+
+    def _call_local_ollama(self, model, stream, temperature):
+        """Call the local Ollama instance synchronously."""
+        import ollama
+        print_yellow(f"🤖 Generating using {model} (local)...")
+        options = {"temperature": temperature}
+        if stream:
+            response_stream = ollama.chat(
+                model=model,
+                messages=self.messages,
+                options=options,
+                stream=True
+            )
+            def local_stream_adapter():
+                for chunk in response_stream:
+                    yield type('OllamaResponse', (), {
+                        'message': type('Message', (), {'content': chunk['message']['content']}),
+                        'done': chunk.get('done', False)
+                    })
+            return self.read_stream(local_stream_adapter())
+        else:
+            response = ollama.chat(
+                model=model,
+                messages=self.messages,
+                options=options
+            )
+            result = response['message']['content']
+            response_obj = type('LocalChatResponse', (), {
+                'message': type('Message', (), {
+                    'content': result,
+                    'get': lambda x: None
+                })
+            })
+            if '</think>' in result:
+                result = result.split('</think>')[-1].strip()
+                response_obj.message.content = result
+            self.messages.append({"role": "assistant", "content": result})
+            if not self.chat:
+                self.messages = [self.messages[0]]
+            return response_obj.message
+
+    async def _call_local_ollama_async(self, model, stream, temperature):
+        """Call the local Ollama instance asynchronously (using a thread pool)."""
+        import ollama
+        import asyncio
+        print_yellow(f"🤖 Generating using {model} (local, async)...")
+        options = {"temperature": temperature}
+        loop = asyncio.get_event_loop()
+        if stream:
+            def run_stream():
+                return ollama.chat(
+                    model=model,
+                    messages=self.messages,
+                    options=options,
+                    stream=True
+                )
+            response_stream = await loop.run_in_executor(None, run_stream)
+            async def local_stream_adapter():
+                for chunk in response_stream:
+                    yield type('OllamaResponse', (), {
+                        'message': type('Message', (), {'content': chunk['message']['content']}),
+                        'done': chunk.get('done', False)
+                    })
+            return local_stream_adapter()
+        else:
+            def run_chat():
+                return ollama.chat(
+                    model=model,
+                    messages=self.messages,
+                    options=options
+                )
+            response_dict = await loop.run_in_executor(None, run_chat)
+            result = response_dict['message']['content']
+            self.messages.append({"role": "assistant", "content": result})
+            if not self.chat:
+                self.messages = [self.messages[0]]
+            return result
+
+    def generate(
+        self,
+        query: str = None,
+        user_input: str = None,
+        context: str = None,
+        stream: bool = False,
+        tools: list = None,
+        images: list = None,
+        model: Optional[
+            Literal["small", "standard", "vision", "reasoning", "tools"]
+        ] = 'standard',
+        temperature: float = None,
+        messages: list[dict] = None,
+        format = None,
+        think = False,
+        force_local: bool = False
+    ):
+        """
+        Generate a response based on the provided query and context.
+        """
+        model = self._prepare_messages_and_model(query, user_input, context, messages, images, model)
+        print(f"[generate] model after _prepare_messages_and_model: {model}")
+        temperature = temperature if temperature else self.options["temperature"]
+        if not force_local:
+            try:
+                headers = self._build_headers(model, tools, think)
+                options = self._get_options(temperature)
+                response = self._call_remote_api(model, tools, stream, options, format, headers)
+                if stream:
+                    return self.read_stream(response)
+                else:
+                    if isinstance(response, ChatResponse):
+                        result = response.message.content.strip('"')
+                        if '</think>' in result:
+                            result = result.split('</think>')[-1]
+                        self.messages.append({"role": "assistant", "content": result.strip('"')})
+                        if tools and not response.message.get("tool_calls"):
+                            pass
+                        if not self.chat:
+                            self.messages = [self.messages[0]]
+                        if not think:
+                            response.message.content = remove_thinking(response.message.content)
+                        return response.message
+                    else:
+                        return "An error occurred."
+            except Exception as e:
+                traceback.print_exc()
+        try:
+            return self._call_local_ollama(model, stream, temperature)
+        except Exception as e:
+            traceback.print_exc()
+            return "Both remote API and local Ollama failed. An error occurred."
+
+    async def async_generate(
+        self,
+        query: str = None,
+        user_input: str = None,
+        context: str = None,
+        stream: bool = False,
+        tools: list = None,
+        images: list = None,
+        model: Optional[
+            Literal["small", "standard", "vision", "reasoning", "tools"]
+        ] = 'standard',
+        temperature: float = None,
+        force_local: bool = False,
+    ):
+        """
+        Asynchronously generates a response based on the provided query and other parameters.
+        """
+        model = self._prepare_messages_and_model(query, user_input, context, None, images, model)
+        temperature = temperature if temperature else self.options["temperature"]
+        if not force_local:
+            try:
+                headers = self._build_headers(model, tools, False)
+                options = self._get_options(temperature)
+                response = await self._call_remote_api_async(model, tools, stream, options, None, headers)
+                # You can add async-specific response handling here if needed
+            except Exception as e:
+                traceback.print_exc()
+        if force_local or 'response' not in locals():
+            try:
+                return await self._call_local_ollama_async(model, stream, temperature)
+            except Exception as e:
+                traceback.print_exc()
+                return "Both remote API and local Ollama failed. An error occurred."
+
+    def make_summary(self, text):
+        # Implement your summary logic using self.client.chat()
+        summary_message = {
+            "role": "user",
+            "content": f'Summarize the text below:\n"""{text}"""\nRemember to be concise and detailed. Answer in English.',
+        }
+        messages = [
+            {
+                "role": "system",
+                "content": "You are summarizing a text. Make it detailed and concise. Answer ONLY with the summary. Don't add any new information.",
+            },
+            summary_message,
+        ]
+        try:
+            response = self.client.chat(
+                model=self.get_model("small"),
+                messages=messages,
+                options=Options(temperature=0.01),
+                keep_alive=3600 * 24 * 7,
+            )
+            summary = response.message.content.strip()
+            print_blue("Summary:", summary)
+            return summary
+        except ResponseError as e:
+            print_red("Error generating summary:", e)
+            return "Summary generation failed."
+
+    def read_stream(self, response):
+        """
+        Yields tuples of (chunk_type, text). The first tuple is ('thinking', ...)
+        if in_thinking is True and stops at </think>. After that, yields ('normal', ...)
+        for the rest of the text.
+        """
+        thinking_buffer = ""
+        in_thinking = self.call_model == self.get_model("reasoning")
+        first_chunk = True
+        prev_content = None
+
+        for chunk in response:
+            if not chunk:
+                continue
+            content = chunk.message.content
+
+            # Remove leading quote if it's the first chunk
+            if first_chunk and content.startswith('"'):
+                content = content[1:]
+            first_chunk = False
+
+            if in_thinking:
+                thinking_buffer += content
+                if "</think>" in thinking_buffer:
+                    end_idx = thinking_buffer.index("</think>") + len("</think>")
+                    yield ("thinking", thinking_buffer[:end_idx])
+                    remaining = thinking_buffer[end_idx:].strip('"')
+                    if chunk.done and remaining:
+                        yield ("normal", remaining)
+                        break
+                    else:
+                        prev_content = remaining
+                    in_thinking = False
+            else:
+                if prev_content:
+                    yield ("normal", prev_content)
+                prev_content = content
+
+            if chunk.done:
+                if prev_content and prev_content.endswith('"'):
+                    prev_content = prev_content[:-1]
+                if prev_content:
+                    yield ("normal", prev_content)
+                break
+
+        self.messages.append({"role": "assistant", "content": ""})
+
+
+    async def async_generate(
+        self,
+        query: str = None,
+        user_input: str = None,
+        context: str = None,
+        stream: bool = False,
+        tools: list = None,
+        images: list = None,
+        model: Optional[Literal["small", "standard", "vision"]] = None,
+        temperature: float = None,
+        force_local: bool = False,  # New parameter to force local Ollama
+    ):
+        """
+        Asynchronously generates a response based on the provided query and other parameters.
+        
+        Args:
+            query (str, optional): The query string to generate a response for.
+            user_input (str, optional): Additional user input to be included in the response.
+            context (str, optional): Context information to be used in generating the response.
+            stream (bool, optional): Whether to stream the response. Defaults to False.
+            tools (list, optional): List of tools to be used in generating the response. Will set the model to 'tools'.
+            images (list, optional): List of images to be included in the response.
+            model (Optional[Literal["small", "standard", "vision", "tools"]], optional): The model to be used for generating the response.
+            temperature (float, optional): The temperature setting for the model.
+            force_local (bool, optional): Force using local Ollama instead of remote API.
+
+        Returns:
+            str: The generated response or an error message if an exception occurs.
+        """
+        print_yellow("ASYNC GENERATE")
+        # Prepare the model and temperature
+        model = self._prepare_messages_and_model(query, user_input, context, None, images, model)
+        temperature = temperature if temperature else self.options["temperature"]
+
+        # First try with remote API
+        if not force_local:
+            try:
+                headers = self._build_headers(model, tools, False)
+                options = self._get_options(temperature)
+                response = await self._call_remote_api_async(model, tools, stream, options, None, headers)
+                
+                # Process response from async client
+                # [Rest of the response processing code as in the original method]
+                
+            except Exception as e:
+                print_red(f"Remote API error: {str(e)}")
+                print_yellow("Falling back to local Ollama...")
+                
+        # Fallback to local Ollama (for async we'll need to use the sync version)
+        if force_local or 'response' not in locals():
+            try:
+                return await self._call_local_ollama_async(model, stream, temperature)
+                    
+            except Exception as e:
+                print_red(f"Local Ollama error: {str(e)}")
+                return "Both remote API and local Ollama failed. An error occurred."
+
+    def prepare_images(self, images, message):
+        """
+        Prepares a list of images by converting them to base64 encoded strings and adds them to the provided message dictionary.
+        Args:
+            images (list): A list of images, where each image can be a file path (str), a base64 encoded string (str), or bytes.
+            message (dict): A dictionary to which the base64 encoded images will be added under the key "images".
+        Returns:
+            dict: The updated message dictionary with the base64 encoded images added under the key "images".
+        Raises:
+            ValueError: If an image is not a string or bytes.
+        """
+        import base64
+
+        base64_images = []
+        base64_pattern = re.compile(r"^[A-Za-z0-9+/]+={0,2}$")
+
+        for image in images:
+            if isinstance(image, str):
+                if base64_pattern.match(image):
+                    base64_images.append(image)
+                else:
+                    with open(image, "rb") as image_file:
+                        base64_images.append(
+                            base64.b64encode(image_file.read()).decode("utf-8")
+                        )
+            elif isinstance(image, bytes):
+                base64_images.append(base64.b64encode(image).decode("utf-8"))
+            else:
+                print_red("Invalid image type")
+
+            message["images"] = base64_images
+            return message
+
+def remove_thinking(response):
+    """Remove the thinking section from the response"""
+    response_text = response.content if hasattr(response, "content") else str(response)
+    if "</think>" in response_text:
+        return response_text.split("</think>")[1].strip()
+    return response_text
+    
+if __name__ == "__main__":
+
+    llm = LLM()
+
+    result = llm.generate(
+        query="I want to add 2 and 2",
+    )
+    print(result.content)
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,26 @@
+from setuptools import setup, find_packages
+
+setup(
+    name="_llm",
+    version="0.1.0",
+    packages=find_packages(),
+    install_requires=[
+        "requests",
+        "tiktoken",
+        "ollama",
+        "env_manager @ git+https://github.com/lasseedfast/env_manager.git",
+        "colorprinter @ git+https://github.com/lasseedfast/colorprinter.git",
+    ],
+    author="Lasse Edfast",
+    author_email="lasse@edfast.se",
+    description="A tool for interacting with LLM models",
+    long_description=open("README.md").read(),
+    long_description_content_type="text/markdown",
+    url="https://github.com/lasseedfast/_llm",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.6",
+)