Enhance README and LLM class: add native thinking feature support, update installation instructions, and improve streaming handling

legacy
lasseedfast 9 months ago
parent 7360a1ed82
commit 2431c667b1
  1. README.md (72)
  2. _llm/llm.py (96)

README.md

@ -2,6 +2,11 @@
A Python package for interacting with LLM models through Ollama, supporting both remote API and local Ollama instances.
## Requirements
- Python 3.8+
- Ollama 0.9.0+ for native thinking feature support
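To confirm that an instance meets this requirement, you can ask the Ollama server for its version. A minimal sketch using the `requests` dependency, assuming a default local instance on port 11434 (point the URL at your remote API instead if needed):

```python
import requests

# Query the Ollama version endpoint (default local address assumed)
resp = requests.get("http://localhost:11434/api/version", timeout=5)
version = resp.json()["version"]
print(f"Ollama version: {version}")

# Native thinking support requires v0.9.0+ (assumes a plain x.y.z version string)
major, minor, *_ = (int(part) for part in version.split("."))
if (major, minor) < (0, 9):
    print("Update Ollama to 0.9.0 or later to use the native thinking feature.")
```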
## Installation
Install directly from GitHub:
@ -34,6 +39,29 @@ This package requires:
- tiktoken: For token counting
- requests: For API communication
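For reference, token counting with tiktoken looks roughly like the sketch below; the `cl100k_base` encoding name is an illustrative choice, not necessarily what this package uses internally:

```python
import tiktoken

# Rough token count for a prompt; the encoding name is an assumption for illustration
encoding = tiktoken.get_encoding("cl100k_base")
prompt = "What would be the impact of increasing carbon taxes by 10%?"
print(len(encoding.encode(prompt)), "tokens")
```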
## Version Compatibility
### Ollama v0.9.0 Native Thinking Support
This package leverages Ollama v0.9.0's native thinking feature, which allows models such as qwen3, deepseek, and others to expose their reasoning process separately from their final answer.
- **Remote API:** If using a remote API, ensure it runs on Ollama v0.9.0+
- **Local Ollama:** Update to v0.9.0+ for native thinking support
- **Backward Compatibility:** The library will attempt to handle both native thinking and older tag-based thinking (`<think>` tags); see the code sketch after the comparison table below.
For the best experience with the thinking feature, ensure all Ollama instances (both local and remote) are updated to v0.9.0 or later.
### Native Thinking vs. Tag-Based Thinking
| Feature | Native Thinking (v0.9.0+) | Tag-Based Thinking (older) |
|---------|--------------------------|---------------------------|
| API Support | Native parameter and response field | Manual parsing of text tags |
| Content Separation | Clean separation of thinking and answer | Tags embedded in content |
| Access Method | `response.thinking` attribute | Text parsing of `<think>` tags |
| Streaming | Clean separation of thinking/content chunks | Manual detection of end tags |
| Reliability | More reliable, officially supported | Relies on model output format |
| Models | Works with all thinking-capable models | Works with models that follow tag conventions |
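In code, the difference in the Access Method row looks roughly as follows. `split_thinking` is a hypothetical helper for illustration, not part of this package, and the regex fallback only approximates the older tag-based output:

```python
import re

def split_thinking(response):
    """Illustrative only: return (thinking, answer) from a chat response."""
    # Native thinking (Ollama v0.9.0+): reasoning is exposed as its own field
    thinking = getattr(response, "thinking", None)
    content = response.content
    if thinking:
        return thinking, content

    # Tag-based fallback (older setups): parse <think>...</think> out of the text
    match = re.search(r"<think>(.*?)</think>", content, flags=re.DOTALL)
    if match:
        return match.group(1).strip(), content[match.end():].strip()
    return None, content
```

With v0.9.0+ on both local and remote instances, only the native branch should ever be taken.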
## Environment Variables
The package requires several environment variables to be set:
@ -110,6 +138,46 @@ async def main():
asyncio.run(main())
```
### Using Thinking Mode
The library supports Ollama's native thinking feature (v0.9.0+), which allows you to see the reasoning process of the model before it provides its final answer.
```python
from llm_client import LLM

# Use with models that support thinking (qwen3, deepseek, etc.)
llm = LLM(model="reasoning")

# Enable thinking mode with the new native Ollama v0.9.0+ support
response = llm.generate(
    query="What would be the impact of increasing carbon taxes by 10%?",
    think=True
)

# Access thinking content (model's reasoning process)
if hasattr(response, 'thinking') and response.thinking:
    print("Model's reasoning process:")
    print(response.thinking)

# Access final answer
print("Final answer:")
print(response.content)
```
When streaming with thinking enabled, the generator yields `(chunk_type, text)` tuples, so reasoning and answer arrive as separate chunk types:
```python
from llm_client import LLM

llm = LLM(model="reasoning")

for chunk_type, chunk in llm.generate(
    query="Solve this step by step: If x² + 3x - 10 = 0, what are the values of x?",
    stream=True,
    think=True
):
    if chunk_type == "thinking":
        print(f"Reasoning: {chunk}")
    elif chunk_type == "content":
        print(f"Answer: {chunk}")
```
## License
MIT

_llm/llm.py

@ -232,12 +232,22 @@ class LLM:
def local_stream_adapter():
for chunk in response_stream:
# Handle both content and thinking in streaming chunks
chunk_message = chunk["message"]
content = chunk_message.get("content", "")
thinking = chunk_message.get("thinking", None)
yield type(
"OllamaResponse",
(),
{
"message": type(
"Message", (), {"content": chunk["message"]["content"]}
"Message",
(),
{
"content": content,
"thinking": thinking # Include thinking in stream chunks
}
),
"done": chunk.get("done", False),
},
@ -253,7 +263,7 @@ class LLM:
)
result = response["message"]["content"]
# Handle thinking content if present with native support
thinking_content = response["message"].get("thinking", None)
response_obj = type(
@ -272,7 +282,7 @@ class LLM:
},
)
# No longer need to manually parse </think> tags with native support
# Store only the main content in message history
self.messages.append({"role": "assistant", "content": result})
if not self.chat:
self.messages = [self.messages[0]]
@ -301,12 +311,22 @@ class LLM:
async def local_stream_adapter():
for chunk in response_stream:
# Handle both content and thinking in async streaming
chunk_message = chunk["message"]
content = chunk_message.get("content", "")
thinking = chunk_message.get("thinking", None)
yield type(
"OllamaResponse",
(),
{
"message": type(
"Message", (), {"content": chunk["message"]["content"]}
"Message",
(),
{
"content": content,
"thinking": thinking # Include thinking in async stream chunks
}
),
"done": chunk.get("done", False),
},
@ -326,7 +346,7 @@ class LLM:
response_dict = await loop.run_in_executor(None, run_chat)
result = response_dict["message"]["content"]
# Handle thinking content if present with native support
thinking_content = response_dict["message"].get("thinking", None)
# Create response object with thinking support
@ -411,16 +431,15 @@ class LLM:
response = self._call_remote_api(
model, tools, stream, options, format, headers, think=think
)
if stream:
return self.read_stream(response)
else:
if isinstance(response, ChatResponse):
# With native thinking support, content is already clean
result = response.message.content.strip('"')
self.messages.append(
{"role": "assistant", "content": result}
)
if not self.chat:
@ -492,24 +511,16 @@ class LLM:
return self.read_stream(response)
else:
if isinstance(response, ChatResponse):
# With native thinking support, content is already clean
result = response.message.content.strip('"')
self.messages.append(
{"role": "assistant", "content": message_content}
{"role": "assistant", "content": result}
)
if not self.chat:
self.messages = [self.messages[0]]
return response.message
else:
return "An error occurred."
@ -554,8 +565,8 @@ class LLM:
def read_stream(self, response):
"""
Read streaming response and handle thinking content with native Ollama v0.9.0+ support.
Thinking content is separate from main content and yielded as different chunk types.
"""
accumulated_content = ""
accumulated_thinking = ""
@ -564,7 +575,7 @@ class LLM:
if not chunk:
continue
# Handle thinking content (native v0.9.0+ support)
thinking_content = getattr(chunk.message, "thinking", None)
if thinking_content:
accumulated_thinking += thinking_content
@ -573,23 +584,27 @@ class LLM:
# Handle regular content
content = chunk.message.content
if content:
# Clean up quotes that sometimes appear in streaming
if content.startswith('"') and len(accumulated_content) == 0:
content = content[1:]
if chunk.done and content.endswith('"'):
content = content[:-1]
accumulated_content += content
yield ("normal", content)
yield ("content", content)
if chunk.done:
break
# Store the complete response in message history (without thinking content)
self.messages.append({"role": "assistant", "content": accumulated_content})
if not self.chat:
self.messages = [self.messages[0]]
# Yield complete thinking summary if accumulated
if accumulated_thinking:
yield ("thinking_complete", accumulated_thinking)
def prepare_images(self, images, message):
"""
Prepares a list of images by converting them to base64 encoded strings and adds them to the provided message dictionary.
@ -625,10 +640,39 @@ class LLM:
if __name__ == "__main__":
# Example usage of the LLM class with thinking mode
llm = LLM()
# Basic usage
result = llm.generate(
query="I want to add 2 and 2",
)
print("Basic result:", result.content)
# Example with thinking mode (for reasoning models)
print("\n--- Thinking Mode Example ---")
thinking_result = llm.generate(
query="Solve this step by step: If I have 15 apples and give away 7, then buy 3 more, how many do I have?",
model="reasoning",
think=True
)
print("Answer:", thinking_result.content)
if hasattr(thinking_result, 'thinking') and thinking_result.thinking:
print("Model's reasoning:", thinking_result.thinking)
# Example with streaming and thinking
print("\n--- Streaming with Thinking Example ---")
for chunk_type, chunk_content in llm.generate(
query="Write a short explanation of photosynthesis",
model="reasoning",
stream=True,
think=True
):
if chunk_type == "thinking":
# Use print with blue color escape codes since print_blue doesn't support 'end' parameter
print(f"\033[94m {chunk_content}\033[0m", end="") # Show reasoning process in blue
elif chunk_type == "content":
print(chunk_content, end="") # Show final answer
elif chunk_type == "thinking_complete":
print_green(f"\n💭 Complete reasoning available")
print() # Final newline
