Enhance README and LLM class: add native thinking feature support, update installation instructions, and improve streaming handling

legacy
lasseedfast 9 months ago
parent 7360a1ed82
commit 2431c667b1
  1. README.md (72)
  2. _llm/llm.py (96)

README.md

@ -2,6 +2,11 @@
A Python package for interacting with LLM models through Ollama, supporting both remote API and local Ollama instances.
## Requirements
- Python 3.8+
- Ollama 0.9.0+ for native thinking feature support
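To confirm that an instance meets this requirement, you can ask the Ollama server for its version. A minimal sketch using the `requests` dependency, assuming a default local instance on port 11434 (point the URL at your remote API instead if needed):

```python
import requests

# Query the Ollama version endpoint (default local address assumed)
resp = requests.get("http://localhost:11434/api/version", timeout=5)
version = resp.json()["version"]
print(f"Ollama version: {version}")

# Native thinking support requires v0.9.0+ (assumes a plain x.y.z version string)
major, minor, *_ = (int(part) for part in version.split("."))
if (major, minor) < (0, 9):
    print("Update Ollama to 0.9.0 or later to use the native thinking feature.")
```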
## Installation
Install directly from GitHub:
@ -34,6 +39,29 @@ This package requires:
- tiktoken: For token counting
- requests: For API communication
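For reference, token counting with tiktoken looks roughly like the sketch below; the `cl100k_base` encoding name is an illustrative choice, not necessarily what this package uses internally:

```python
import tiktoken

# Rough token count for a prompt; the encoding name is an assumption for illustration
encoding = tiktoken.get_encoding("cl100k_base")
prompt = "What would be the impact of increasing carbon taxes by 10%?"
print(len(encoding.encode(prompt)), "tokens")
```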
## Version Compatibility
### Ollama v0.9.0 Native Thinking Support
This package leverages Ollama v0.9.0's native thinking feature, which allows models such as qwen3, deepseek, and others to expose their reasoning process separately from their final answer.
- **Remote API:** If using a remote API, ensure it runs on Ollama v0.9.0+
- **Local Ollama:** Update to v0.9.0+ for native thinking support
- **Backward Compatibility:** The library will attempt to handle both native thinking and older tag-based thinking (`<think>` tags); see the code sketch after the comparison table below.
For the best experience with the thinking feature, ensure all Ollama instances (both local and remote) are updated to v0.9.0 or later.
### Native Thinking vs. Tag-Based Thinking
| Feature | Native Thinking (v0.9.0+) | Tag-Based Thinking (older) |
|---------|--------------------------|---------------------------|
| API Support | Native parameter and response field | Manual parsing of text tags |
| Content Separation | Clean separation of thinking and answer | Tags embedded in content |
| Access Method | `response.thinking` attribute | Text parsing of `<think>` tags |
| Streaming | Clean separation of thinking/content chunks | Manual detection of end tags |
| Reliability | More reliable, officially supported | Relies on model output format |
| Models | Works with all thinking-capable models | Works with models that follow tag conventions |
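In code, the difference in the Access Method row looks roughly as follows. `split_thinking` is a hypothetical helper for illustration, not part of this package, and the regex fallback only approximates the older tag-based output:

```python
import re

def split_thinking(response):
    """Illustrative only: return (thinking, answer) from a chat response."""
    # Native thinking (Ollama v0.9.0+): reasoning is exposed as its own field
    thinking = getattr(response, "thinking", None)
    content = response.content
    if thinking:
        return thinking, content

    # Tag-based fallback (older setups): parse <think>...</think> out of the text
    match = re.search(r"<think>(.*?)</think>", content, flags=re.DOTALL)
    if match:
        return match.group(1).strip(), content[match.end():].strip()
    return None, content
```

With v0.9.0+ on both local and remote instances, only the native branch should ever be taken.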
## Environment Variables
The package requires several environment variables to be set:
@ -110,6 +138,46 @@ async def main():
asyncio.run(main())
```
### Using Thinking Mode
The library supports Ollama's native thinking feature (v0.9.0+), which allows you to see the reasoning process of the model before it provides its final answer.
```python
from llm_client import LLM

# Use with models that support thinking (qwen3, deepseek, etc.)
llm = LLM(model="reasoning")

# Enable thinking mode with the new native Ollama v0.9.0+ support
response = llm.generate(
    query="What would be the impact of increasing carbon taxes by 10%?",
    think=True
)

# Access thinking content (model's reasoning process)
if hasattr(response, 'thinking') and response.thinking:
    print("Model's reasoning process:")
    print(response.thinking)

# Access final answer
print("Final answer:")
print(response.content)
```
When streaming with thinking enabled, the generator yields `(chunk_type, text)` tuples, so reasoning and answer arrive as separate chunk types:
```python
from llm_client import LLM

llm = LLM(model="reasoning")

for chunk_type, chunk in llm.generate(
    query="Solve this step by step: If x² + 3x - 10 = 0, what are the values of x?",
    stream=True,
    think=True
):
    if chunk_type == "thinking":
        print(f"Reasoning: {chunk}")
    elif chunk_type == "content":
        print(f"Answer: {chunk}")
```
## License
MIT

_llm/llm.py

@ -232,12 +232,22 @@ class LLM:
def local_stream_adapter():
for chunk in response_stream:
# Handle both content and thinking in streaming chunks
chunk_message = chunk["message"]
content = chunk_message.get("content", "")
thinking = chunk_message.get("thinking", None)
yield type(
"OllamaResponse",
(),
{
"message": type(
"Message", (), {"content": chunk["message"]["content"]}
"Message",
(),
{
"content": content,
"thinking": thinking # Include thinking in stream chunks
}
),
"done": chunk.get("done", False),
},
@ -253,7 +263,7 @@ class LLM:
)
result = response["message"]["content"]
# Handle thinking content if present with native support
thinking_content = response["message"].get("thinking", None)
response_obj = type(
@ -272,7 +282,7 @@ class LLM:
},
)
# No longer need to manually parse </think> tags with native support
# Store only the main content in message history
self.messages.append({"role": "assistant", "content": result})
if not self.chat:
self.messages = [self.messages[0]]
@ -301,12 +311,22 @@ class LLM:
async def local_stream_adapter():
for chunk in response_stream:
# Handle both content and thinking in async streaming
chunk_message = chunk["message"]
content = chunk_message.get("content", "")
thinking = chunk_message.get("thinking", None)
yield type(
"OllamaResponse",
(),
{
"message": type(
"Message", (), {"content": chunk["message"]["content"]}
"Message",
(),
{
"content": content,
"thinking": thinking # Include thinking in async stream chunks
}
),
"done": chunk.get("done", False),
},
@ -326,7 +346,7 @@ class LLM:
response_dict = await loop.run_in_executor(None, run_chat)
result = response_dict["message"]["content"]
# Handle thinking content if present with native support
thinking_content = response_dict["message"].get("thinking", None)
# Create response object with thinking support
@ -411,16 +431,15 @@ class LLM:
response = self._call_remote_api(
model, tools, stream, options, format, headers, think=think
)
if stream:
return self.read_stream(response)
else:
if isinstance(response, ChatResponse):
# With native thinking support, content is already clean
result = response.message.content.strip('"')
self.messages.append(
{"role": "assistant", "content": result}
)
if not self.chat:
@ -492,24 +511,16 @@ class LLM:
return self.read_stream(response)
else:
if isinstance(response, ChatResponse):
# With native thinking support, content is already clean
result = response.message.content.strip('"')
self.messages.append(
{"role": "assistant", "content": message_content}
{"role": "assistant", "content": result}
)
if not self.chat:
self.messages = [self.messages[0]]
return response.message
else:
return "An error occurred."
@ -554,8 +565,8 @@ class LLM:
def read_stream(self, response):
"""
Read streaming response and handle thinking content with native Ollama v0.9.0+ support.
Thinking content is separate from main content and yielded as different chunk types.
"""
accumulated_content = ""
accumulated_thinking = ""
@ -564,7 +575,7 @@ class LLM:
if not chunk:
continue
# Handle thinking content (native v0.9.0+ support)
thinking_content = getattr(chunk.message, "thinking", None)
if thinking_content:
accumulated_thinking += thinking_content
@ -573,23 +584,27 @@ class LLM:
# Handle regular content
content = chunk.message.content
if content:
# Clean up quotes that sometimes appear in streaming
if content.startswith('"') and len(accumulated_content) == 0:
content = content[1:]
if chunk.done and content.endswith('"'):
content = content[:-1]
accumulated_content += content
yield ("normal", content)
yield ("content", content)
if chunk.done:
break
# Store the complete response in message history (without thinking content)
self.messages.append({"role": "assistant", "content": accumulated_content})
if not self.chat:
self.messages = [self.messages[0]]
# Yield complete thinking summary if accumulated
if accumulated_thinking:
yield ("thinking_complete", accumulated_thinking)
def prepare_images(self, images, message):
"""
Prepares a list of images by converting them to base64 encoded strings and adds them to the provided message dictionary.
@ -625,10 +640,39 @@ class LLM:
if __name__ == "__main__":
# Example usage of the LLM class with thinking mode
llm = LLM()
# Basic usage
result = llm.generate(
query="I want to add 2 and 2",
)
print("Basic result:", result.content)
# Example with thinking mode (for reasoning models)
print("\n--- Thinking Mode Example ---")
thinking_result = llm.generate(
query="Solve this step by step: If I have 15 apples and give away 7, then buy 3 more, how many do I have?",
model="reasoning",
think=True
)
print("Answer:", thinking_result.content)
if hasattr(thinking_result, 'thinking') and thinking_result.thinking:
print("Model's reasoning:", thinking_result.thinking)
# Example with streaming and thinking
print("\n--- Streaming with Thinking Example ---")
for chunk_type, chunk_content in llm.generate(
query="Write a short explanation of photosynthesis",
model="reasoning",
stream=True,
think=True
):
if chunk_type == "thinking":
# Use print with blue color escape codes since print_blue doesn't support 'end' parameter
print(f"\033[94m {chunk_content}\033[0m", end="") # Show reasoning process in blue
elif chunk_type == "content":
print(chunk_content, end="") # Show final answer
elif chunk_type == "thinking_complete":
print_green(f"\n💭 Complete reasoning available")
print() # Final newline
