From 2431c667b1771b6c2e15bb338f3f182713d96827 Mon Sep 17 00:00:00 2001
From: lasseedfast <>
Date: Fri, 30 May 2025 21:34:17 +0200
Subject: [PATCH] Enhance README and LLM class: add native thinking feature support, update installation instructions, and improve streaming handling

---
 README.md   |  72 +++++++++++++++++++++++++++++++++++--
 _llm/llm.py | 100 +++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 142 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 6c69ed3..8ba47a7 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,11 @@
 
 A Python package for interacting with LLM models through Ollama, supporting both remote API and local Ollama instances.
 
+## Requirements
+
+- Python 3.8+
+- Ollama 0.9.0+ for native thinking feature support
+
 ## Installation
 
 Install directly from GitHub:
@@ -34,6 +39,29 @@ This package requires:
 - tiktoken: For token counting
 - requests: For API communication
 
+## Version Compatibility
+
+### Ollama v0.9.0 Native Thinking Support
+
+This package leverages Ollama v0.9.0's native thinking feature. This allows models like qwen3, deepseek, and others to expose their reasoning process separately from their final answer.
+
+- **Remote API:** If using a remote API, ensure it runs on Ollama v0.9.0+
+- **Local Ollama:** Update to v0.9.0+ for native thinking support
+- **Backward Compatibility:** The library will attempt to handle both native thinking and older tag-based thinking (`<think>` tags)
+
+For the best experience with the thinking feature, ensure all Ollama instances (both local and remote) are updated to v0.9.0 or later.
+
+### Native Thinking vs. Tag-Based Thinking
+
+| Feature | Native Thinking (v0.9.0+) | Tag-Based Thinking (older) |
+|---------|--------------------------|---------------------------|
+| API Support | Native parameter and response field | Manual parsing of text tags |
+| Content Separation | Clean separation of thinking and answer | Tags embedded in content |
+| Access Method | `response.thinking` attribute | Text parsing of `<think>` tags |
+| Streaming | Clean separation of thinking/content chunks | Manual detection of end tags |
+| Reliability | More reliable, officially supported | Relies on model output format |
+| Models | Works with all thinking-capable models | Works with models that follow tag conventions |
+
 ## Environment Variables
 
 The package requires several environment variables to be set:
@@ -110,6 +138,46 @@
 asyncio.run(main())
 ```
 
-## License
+### Using Thinking Mode
+
+The library supports Ollama's native thinking feature (v0.9.0+), which allows you to see the reasoning process of the model before it provides its final answer.
+
+```python
+from llm_client import LLM
+
+# Use with models that support thinking (qwen3, deepseek, etc.)
+llm = LLM(model="reasoning")
+
+# Enable thinking mode with the new native Ollama v0.9.0+ support
+response = llm.generate(
+    query="What would be the impact of increasing carbon taxes by 10%?",
+    think=True
+)
+
+# Access thinking content (model's reasoning process)
+if hasattr(response, 'thinking') and response.thinking:
+    print("Model's reasoning process:")
+    print(response.thinking)
+
+# Access final answer
+print("Final answer:")
+print(response.content)
+```
+
+When streaming with thinking enabled, the generator yields `(chunk_type, chunk)` tuples: `"thinking"` chunks carry the model's reasoning, `"content"` chunks carry the answer, and a final `"thinking_complete"` chunk carries the accumulated reasoning:
 
-MIT
\ No newline at end of file
+```python
+from llm_client import LLM
+
+llm = LLM(model="reasoning")
+
+for chunk_type, chunk in llm.generate(
+    query="Solve this step by step: If x² + 3x - 10 = 0, what are the values of x?",
+    stream=True,
+    think=True
+):
+    if chunk_type == "thinking":
+        print(f"Reasoning: {chunk}")
+    elif chunk_type == "content":
+        print(f"Answer: {chunk}")
+```
diff --git a/_llm/llm.py b/_llm/llm.py
index 615a264..2de1144 100644
--- a/_llm/llm.py
+++ b/_llm/llm.py
@@ -232,12 +232,22 @@ class LLM:
 
         def local_stream_adapter():
             for chunk in response_stream:
+                # Handle both content and thinking in streaming chunks
+                chunk_message = chunk["message"]
+                content = chunk_message.get("content", "")
+                thinking = chunk_message.get("thinking", None)
+
                 yield type(
                     "OllamaResponse",
                     (),
                     {
                         "message": type(
-                            "Message", (), {"content": chunk["message"]["content"]}
+                            "Message",
+                            (),
+                            {
+                                "content": content,
+                                "thinking": thinking  # Include thinking in stream chunks
+                            }
                         ),
                         "done": chunk.get("done", False),
                     },
@@ -253,7 +263,7 @@ class LLM:
             )
             result = response["message"]["content"]
 
-            # Handle thinking content if present (for backward compatibility)
+            # Handle thinking content if present with native support
             thinking_content = response["message"].get("thinking", None)
 
             response_obj = type(
@@ -272,7 +282,7 @@ class LLM:
                 },
             )
 
-            # No longer need to manually parse tags with native support
+            # Store only the main content in message history
             self.messages.append({"role": "assistant", "content": result})
             if not self.chat:
                 self.messages = [self.messages[0]]
@@ -301,12 +311,22 @@ class LLM:
 
         async def local_stream_adapter():
             for chunk in response_stream:
+                # Handle both content and thinking in async streaming
+                chunk_message = chunk["message"]
+                content = chunk_message.get("content", "")
+                thinking = chunk_message.get("thinking", None)
+
                 yield type(
                     "OllamaResponse",
                     (),
                     {
                         "message": type(
-                            "Message", (), {"content": chunk["message"]["content"]}
+                            "Message",
+                            (),
+                            {
+                                "content": content,
+                                "thinking": thinking  # Include thinking in async stream chunks
+                            }
                         ),
                         "done": chunk.get("done", False),
                     },
@@ -326,7 +346,7 @@ class LLM:
             response_dict = await loop.run_in_executor(None, run_chat)
             result = response_dict["message"]["content"]
 
-            # Handle thinking content if present (for backward compatibility)
+            # Handle thinking content if present with native support
             thinking_content = response_dict["message"].get("thinking", None)
 
             # Create response object with thinking support
@@ -411,16 +431,15 @@ class LLM:
             response = self._call_remote_api(
                 model, tools, stream, options, format, headers, think=think
             )
-            print_rainbow(response)
 
             if stream:
                 return self.read_stream(response)
             else:
                 if isinstance(response, ChatResponse):
+                    # With native thinking support, content is already clean
                     result = response.message.content.strip('"')
-
-                    message_content = result.strip('"')
+
                     self.messages.append(
-                        {"role": "assistant", "content": message_content}
+                        {"role": "assistant", "content": result}
                     )
                     if not 
self.chat: @@ -492,24 +511,16 @@ class LLM: return self.read_stream(response) else: if isinstance(response, ChatResponse): - # Handle native thinking mode with separate thinking field + # With native thinking support, content is already clean result = response.message.content.strip('"') - thinking_content = getattr(response.message, "thinking", None) - - # Store both content and thinking in message history - message_content = result.strip('"') + self.messages.append( - {"role": "assistant", "content": message_content} + {"role": "assistant", "content": result} ) if not self.chat: self.messages = [self.messages[0]] - # Return response with both content and thinking accessible - if thinking_content and think: - # Add thinking as an attribute for access if needed - response.message.thinking = thinking_content - return response.message else: return "An error occurred." @@ -554,8 +565,8 @@ class LLM: def read_stream(self, response): """ - Read streaming response and handle thinking content appropriately. - With native thinking mode, the thinking content is separate from the main content. + Read streaming response and handle thinking content with native Ollama v0.9.0+ support. + Thinking content is separate from main content and yielded as different chunk types. """ accumulated_content = "" accumulated_thinking = "" @@ -564,7 +575,7 @@ class LLM: if not chunk: continue - # Handle thinking content (if present in streaming) + # Handle thinking content (native v0.9.0+ support) thinking_content = getattr(chunk.message, "thinking", None) if thinking_content: accumulated_thinking += thinking_content @@ -573,23 +584,27 @@ class LLM: # Handle regular content content = chunk.message.content if content: - # Remove leading/trailing quotes that sometimes appear + # Clean up quotes that sometimes appear in streaming if content.startswith('"') and len(accumulated_content) == 0: content = content[1:] if chunk.done and content.endswith('"'): content = content[:-1] accumulated_content += content - yield ("normal", content) + yield ("content", content) if chunk.done: break - # Store the complete response in message history + # Store the complete response in message history (without thinking content) self.messages.append({"role": "assistant", "content": accumulated_content}) if not self.chat: self.messages = [self.messages[0]] + # Yield complete thinking summary if accumulated + if accumulated_thinking: + yield ("thinking_complete", accumulated_thinking) + def prepare_images(self, images, message): """ Prepares a list of images by converting them to base64 encoded strings and adds them to the provided message dictionary. 
@@ -625,10 +640,39 @@
 
 
 if __name__ == "__main__":
-
+    # Example usage of the LLM class with thinking mode
     llm = LLM()
 
+    # Basic usage
     result = llm.generate(
         query="I want to add 2 and 2",
     )
-    print(result.content)
+    print("Basic result:", result.content)
+
+    # Example with thinking mode (for reasoning models)
+    print("\n--- Thinking Mode Example ---")
+    thinking_result = llm.generate(
+        query="Solve this step by step: If I have 15 apples and give away 7, then buy 3 more, how many do I have?",
+        model="reasoning",
+        think=True
+    )
+    print("Answer:", thinking_result.content)
+    if hasattr(thinking_result, 'thinking') and thinking_result.thinking:
+        print("Model's reasoning:", thinking_result.thinking)
+
+    # Example with streaming and thinking
+    print("\n--- Streaming with Thinking Example ---")
+    for chunk_type, chunk_content in llm.generate(
+        query="Write a short explanation of photosynthesis",
+        model="reasoning",
+        stream=True,
+        think=True
+    ):
+        if chunk_type == "thinking":
+            # Use print with blue color escape codes since print_blue doesn't support 'end' parameter
+            print(f"\033[94m {chunk_content}\033[0m", end="")  # Show reasoning process in blue
+        elif chunk_type == "content":
+            print(chunk_content, end="")  # Show final answer
+        elif chunk_type == "thinking_complete":
+            print_green("\n💭 Complete reasoning available")  # chunk_content holds the full accumulated reasoning
+    print()  # Final newline
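
A note on the backward-compatibility claim in the README: the patch itself only reads Ollama's native `thinking` field, so callers talking to a pre-0.9.0 server, or to a model that still emits inline `<think>...</think>` tags, would have to separate the reasoning themselves. The sketch below is a hypothetical illustration of that tag-based fallback; `split_think_tags` is not part of this patch or of the `llm_client` API, and it assumes the model wraps its reasoning in `<think>` tags.

```python
import re
from typing import Tuple

# Hypothetical fallback (not part of this patch): pull tag-based reasoning out of
# the raw content string instead of reading the native `thinking` field.
_THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)


def split_think_tags(content: str) -> Tuple[str, str]:
    """Return (thinking, answer) parsed from inline <think>...</think> tags."""
    thinking = "\n".join(part.strip() for part in _THINK_RE.findall(content))
    answer = _THINK_RE.sub("", content).strip()
    return thinking, answer


if __name__ == "__main__":
    raw = "<think>15 - 7 = 8, then 8 + 3 = 11.</think>You end up with 11 apples."
    reasoning, answer = split_think_tags(raw)
    print("Reasoning:", reasoning)  # -> 15 - 7 = 8, then 8 + 3 = 11.
    print("Answer:", answer)        # -> You end up with 11 apples.
```

With native thinking on Ollama v0.9.0+, none of this is needed: the reasoning arrives separately in `response.thinking` (non-streaming) or as `("thinking", ...)` chunks (streaming), already stripped from the answer.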