A Python package for interacting with LLM models through Ollama, supporting both remote API and local Ollama instances.
## Requirements
- Python 3.8+
- Ollama 0.9.0+ for native thinking feature support
## Installation
Install directly from GitHub:
This package requires:
- tiktoken: For token counting
- requests: For API communication
## Version Compatibility
### Ollama v0.9.0 Native Thinking Support
This package leverages Ollama v0.9.0's native thinking feature, which allows models such as qwen3 and deepseek to expose their reasoning process separately from their final answer.
- **Remote API:** If using a remote API, ensure it runs on Ollama v0.9.0+
- **Local Ollama:** Update to v0.9.0+ for native thinking support
- **Backward Compatibility:** The library will attempt to handle both native thinking and older tag-based thinking (`<think>` tags)
For the best experience with the thinking feature, ensure all Ollama instances (both local and remote) are updated to v0.9.0 or later.
The table below compares the two approaches:

| Feature | Native Thinking (Ollama v0.9.0+) | Tag-Based Thinking (`<think>` tags) |
|---|---|---|
| API Support | Native parameter and response field | Manual parsing of text tags |
| Content Separation | Clean separation of thinking and answer | Tags embedded in content |
| Access Method | `response.thinking` attribute | Text parsing of `<think>` tags |
| Streaming | Clean separation of thinking/content chunks | Manual detection of end tags |
| Reliability | More reliable, officially supported | Relies on model output format |
| Models | Works with all thinking-capable models | Works with models that follow tag conventions |
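As a rough sketch of the backward-compatibility behaviour described above, calling code could prefer the native `thinking` field and fall back to parsing `<think>` tags when it is absent. The `.thinking` and `.content` attribute names follow the table above; treat this as an illustration rather than the library's internal implementation:

```python
import re


def split_thinking(response):
    """Illustrative helper: prefer the native thinking field, else parse <think> tags."""
    # Ollama v0.9.0+ exposes reasoning in a dedicated field (response.thinking here).
    thinking = getattr(response, "thinking", None)
    if thinking:
        return thinking, response.content

    # Older servers/models may embed reasoning in <think>...</think> tags in the content.
    match = re.search(r"<think>(.*?)</think>", response.content, re.DOTALL)
    if match:
        answer = re.sub(r"<think>.*?</think>", "", response.content, flags=re.DOTALL).strip()
        return match.group(1).strip(), answer

    # No reasoning available: return the content unchanged.
    return None, response.content
```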
## Environment Variables
The package requires several environment variables to be set:
asyncio.run(main())
```
### Using Thinking Mode
The library supports Ollama's native thinking feature (v0.9.0+), which allows you to see the reasoning process of the model before it provides its final answer.
```python
from llm_client import LLM
# Use with models that support thinking (qwen3, deepseek, etc.)
llm = LLM(model="reasoning")
# Enable thinking mode with the new native Ollama v0.9.0+ support
response = llm.generate(
    query="What would be the impact of increasing carbon taxes by 10%?",