From 2431c667b1771b6c2e15bb338f3f182713d96827 Mon Sep 17 00:00:00 2001
From: lasseedfast <>
Date: Fri, 30 May 2025 21:34:17 +0200
Subject: [PATCH] Enhance README and LLM class: add native thinking feature support, update installation instructions, and improve streaming handling

---
 README.md   |  72 +++++++++++++++++++++++++++++++++++--
 _llm/llm.py | 100 +++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 142 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 6c69ed3..8ba47a7 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,11 @@
 
 A Python package for interacting with LLM models through Ollama, supporting both remote API and local Ollama instances.
 
+## Requirements
+
+- Python 3.8+
+- Ollama 0.9.0+ for native thinking feature support
+
 ## Installation
 
 Install directly from GitHub:
@@ -34,6 +39,29 @@ This package requires:
 - tiktoken: For token counting
 - requests: For API communication
 
+## Version Compatibility
+
+### Ollama v0.9.0 Native Thinking Support
+
+This package leverages Ollama v0.9.0's native thinking feature. This allows models like qwen3, deepseek, and others to expose their reasoning process separately from their final answer.
+
+- **Remote API:** If using a remote API, ensure it runs on Ollama v0.9.0+
+- **Local Ollama:** Update to v0.9.0+ for native thinking support
+- **Backward Compatibility:** The library will attempt to handle both native thinking and older tag-based thinking (`<think>` tags)
+
+For the best experience with the thinking feature, ensure all Ollama instances (both local and remote) are updated to v0.9.0 or later.
+
+### Native Thinking vs. Tag-Based Thinking
+
+| Feature | Native Thinking (v0.9.0+) | Tag-Based Thinking (older) |
+|---------|--------------------------|---------------------------|
+| API Support | Native parameter and response field | Manual parsing of text tags |
+| Content Separation | Clean separation of thinking and answer | Tags embedded in content |
+| Access Method | `response.thinking` attribute | Text parsing of `<think>` tags |
+| Streaming | Clean separation of thinking/content chunks | Manual detection of end tags |
+| Reliability | More reliable, officially supported | Relies on model output format |
+| Models | Works with all thinking-capable models | Works with models that follow tag conventions |
+
 ## Environment Variables
 
 The package requires several environment variables to be set:
@@ -110,6 +138,46 @@
 asyncio.run(main())
 ```
 
-## License
+### Using Thinking Mode
+
+The library supports Ollama's native thinking feature (v0.9.0+), which allows you to see the reasoning process of the model before it provides its final answer.
+
+```python
+from llm_client import LLM
+
+# Use with models that support thinking (qwen3, deepseek, etc.)
+llm = LLM(model="reasoning")
+
+# Enable thinking mode with the new native Ollama v0.9.0+ support
+response = llm.generate(
+    query="What would be the impact of increasing carbon taxes by 10%?",
+    think=True
+)
+
+# Access thinking content (model's reasoning process)
+if hasattr(response, 'thinking') and response.thinking:
+    print("Model's reasoning process:")
+    print(response.thinking)
+
+# Access final answer
+print("Final answer:")
+print(response.content)
+```
+
+When streaming with thinking enabled, the generator yields `(chunk_type, chunk)` tuples: `"thinking"` chunks carry the model's reasoning, `"content"` chunks carry the answer, and a final `"thinking_complete"` chunk carries the accumulated reasoning:
 
-MIT
\ No newline at end of file
+```python
+from llm_client import LLM
+
+llm = LLM(model="reasoning")
+
+for chunk_type, chunk in llm.generate(
+    query="Solve this step by step: If x² + 3x - 10 = 0, what are the values of x?",
+    stream=True,
+    think=True
+):
+    if chunk_type == "thinking":
+        print(f"Reasoning: {chunk}")
+    elif chunk_type == "content":
+        print(f"Answer: {chunk}")
+```
diff --git a/_llm/llm.py b/_llm/llm.py
index 615a264..2de1144 100644
--- a/_llm/llm.py
+++ b/_llm/llm.py
@@ -232,12 +232,22 @@ class LLM:
 
         def local_stream_adapter():
             for chunk in response_stream:
+                # Handle both content and thinking in streaming chunks
+                chunk_message = chunk["message"]
+                content = chunk_message.get("content", "")
+                thinking = chunk_message.get("thinking", None)
+
                 yield type(
                     "OllamaResponse",
                     (),
                     {
                         "message": type(
-                            "Message", (), {"content": chunk["message"]["content"]}
+                            "Message",
+                            (),
+                            {
+                                "content": content,
+                                "thinking": thinking  # Include thinking in stream chunks
+                            }
                         ),
                         "done": chunk.get("done", False),
                     },
@@ -253,7 +263,7 @@ class LLM:
             )
             result = response["message"]["content"]
 
-            # Handle thinking content if present (for backward compatibility)
+            # Handle thinking content if present with native support
             thinking_content = response["message"].get("thinking", None)
 
             response_obj = type(
@@ -272,7 +282,7 @@ class LLM:
                 },
             )
 
-            # No longer need to manually parse tags with native support
+            # Store only the main content in message history
             self.messages.append({"role": "assistant", "content": result})
             if not self.chat:
                 self.messages = [self.messages[0]]
@@ -301,12 +311,22 @@ class LLM:
 
         async def local_stream_adapter():
             for chunk in response_stream:
+                # Handle both content and thinking in async streaming
+                chunk_message = chunk["message"]
+                content = chunk_message.get("content", "")
+                thinking = chunk_message.get("thinking", None)
+
                 yield type(
                     "OllamaResponse",
                     (),
                     {
                         "message": type(
-                            "Message", (), {"content": chunk["message"]["content"]}
+                            "Message",
+                            (),
+                            {
+                                "content": content,
+                                "thinking": thinking  # Include thinking in async stream chunks
+                            }
                         ),
                         "done": chunk.get("done", False),
                     },
@@ -326,7 +346,7 @@ class LLM:
             response_dict = await loop.run_in_executor(None, run_chat)
             result = response_dict["message"]["content"]
 
-            # Handle thinking content if present (for backward compatibility)
+            # Handle thinking content if present with native support
             thinking_content = response_dict["message"].get("thinking", None)
 
             # Create response object with thinking support
@@ -411,16 +431,15 @@ class LLM:
             response = self._call_remote_api(
                 model, tools, stream, options, format, headers, think=think
             )
-            print_rainbow(response)
 
             if stream:
                 return self.read_stream(response)
             else:
                 if isinstance(response, ChatResponse):
+                    # With native thinking support, content is already clean
                     result = response.message.content.strip('"')
-
-                    message_content = result.strip('"')
+
                     self.messages.append(
-                        {"role": "assistant", "content": message_content}
+                        {"role": "assistant", "content": result}
                     )
                     if not 
self.chat: @@ -492,24 +511,16 @@ class LLM: return self.read_stream(response) else: if isinstance(response, ChatResponse): - # Handle native thinking mode with separate thinking field + # With native thinking support, content is already clean result = response.message.content.strip('"') - thinking_content = getattr(response.message, "thinking", None) - - # Store both content and thinking in message history - message_content = result.strip('"') + self.messages.append( - {"role": "assistant", "content": message_content} + {"role": "assistant", "content": result} ) if not self.chat: self.messages = [self.messages[0]] - # Return response with both content and thinking accessible - if thinking_content and think: - # Add thinking as an attribute for access if needed - response.message.thinking = thinking_content - return response.message else: return "An error occurred." @@ -554,8 +565,8 @@ class LLM: def read_stream(self, response): """ - Read streaming response and handle thinking content appropriately. - With native thinking mode, the thinking content is separate from the main content. + Read streaming response and handle thinking content with native Ollama v0.9.0+ support. + Thinking content is separate from main content and yielded as different chunk types. """ accumulated_content = "" accumulated_thinking = "" @@ -564,7 +575,7 @@ class LLM: if not chunk: continue - # Handle thinking content (if present in streaming) + # Handle thinking content (native v0.9.0+ support) thinking_content = getattr(chunk.message, "thinking", None) if thinking_content: accumulated_thinking += thinking_content @@ -573,23 +584,27 @@ class LLM: # Handle regular content content = chunk.message.content if content: - # Remove leading/trailing quotes that sometimes appear + # Clean up quotes that sometimes appear in streaming if content.startswith('"') and len(accumulated_content) == 0: content = content[1:] if chunk.done and content.endswith('"'): content = content[:-1] accumulated_content += content - yield ("normal", content) + yield ("content", content) if chunk.done: break - # Store the complete response in message history + # Store the complete response in message history (without thinking content) self.messages.append({"role": "assistant", "content": accumulated_content}) if not self.chat: self.messages = [self.messages[0]] + # Yield complete thinking summary if accumulated + if accumulated_thinking: + yield ("thinking_complete", accumulated_thinking) + def prepare_images(self, images, message): """ Prepares a list of images by converting them to base64 encoded strings and adds them to the provided message dictionary. 
@@ -625,10 +640,39 @@
 
 
 if __name__ == "__main__":
-
+    # Example usage of the LLM class with thinking mode
     llm = LLM()
 
+    # Basic usage
     result = llm.generate(
         query="I want to add 2 and 2",
     )
-    print(result.content)
+    print("Basic result:", result.content)
+
+    # Example with thinking mode (for reasoning models)
+    print("\n--- Thinking Mode Example ---")
+    thinking_result = llm.generate(
+        query="Solve this step by step: If I have 15 apples and give away 7, then buy 3 more, how many do I have?",
+        model="reasoning",
+        think=True
+    )
+    print("Answer:", thinking_result.content)
+    if hasattr(thinking_result, 'thinking') and thinking_result.thinking:
+        print("Model's reasoning:", thinking_result.thinking)
+
+    # Example with streaming and thinking
+    print("\n--- Streaming with Thinking Example ---")
+    for chunk_type, chunk_content in llm.generate(
+        query="Write a short explanation of photosynthesis",
+        model="reasoning",
+        stream=True,
+        think=True
+    ):
+        if chunk_type == "thinking":
+            # Use print with blue color escape codes since print_blue doesn't support 'end' parameter
+            print(f"\033[94m {chunk_content}\033[0m", end="")  # Show reasoning process in blue
+        elif chunk_type == "content":
+            print(chunk_content, end="")  # Show final answer
+        elif chunk_type == "thinking_complete":
+            print_green("\n💭 Complete reasoning available")  # chunk_content holds the full accumulated reasoning
+    print()  # Final newline
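
A note on the backward-compatibility claim in the README: the patch itself only reads Ollama's native `thinking` field, so callers talking to a pre-0.9.0 server, or to a model that still emits inline `<think>...</think>` tags, would have to separate the reasoning themselves. The sketch below is a hypothetical illustration of that tag-based fallback; `split_think_tags` is not part of this patch or of the `llm_client` API, and it assumes the model wraps its reasoning in `<think>` tags.

```python
import re
from typing import Tuple

# Hypothetical fallback (not part of this patch): pull tag-based reasoning out of
# the raw content string instead of reading the native `thinking` field.
_THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)


def split_think_tags(content: str) -> Tuple[str, str]:
    """Return (thinking, answer) parsed from inline <think>...</think> tags."""
    thinking = "\n".join(part.strip() for part in _THINK_RE.findall(content))
    answer = _THINK_RE.sub("", content).strip()
    return thinking, answer


if __name__ == "__main__":
    raw = "<think>15 - 7 = 8, then 8 + 3 = 11.</think>You end up with 11 apples."
    reasoning, answer = split_think_tags(raw)
    print("Reasoning:", reasoning)  # -> 15 - 7 = 8, then 8 + 3 = 11.
    print("Answer:", answer)        # -> You end up with 11 apples.
```

With native thinking on Ollama v0.9.0+, none of this is needed: the reasoning arrives separately in `response.thinking` (non-streaming) or as `("thinking", ...)` chunks (streaming), already stripped from the answer.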