This is now vLLM only

main
Lasse Server, 2 months ago
parent 238a5146f8 · commit 6c6be0798e
Changed files:
1. __init__.py (4 lines changed)
2. _llm/llm.py (1795 lines changed)
3. _llm/tests_for_llm.py (492 lines changed)
4. _llm/tool_registry.py (90 lines changed)

@@ -2,7 +2,7 @@
 llm_client: A Python package for interacting with LLM models through Ollama.
 """
-from _llm._llm.llm import LLM
+from _llm._llm.llm import LLM, ChatCompletionMessage
 from _llm._llm.tool_registry import register_tool, get_tools
-__all__ = ["LLM", "register_tool", "get_tools"]
+__all__ = ["LLM", "register_tool", "get_tools", "ChatCompletionMessage"]

File diff suppressed because it is too large.

@@ -0,0 +1,492 @@
# ------------------- TESTS ---------------------------------------------------------
if __name__ == "__main__":
    import asyncio
    from typing import List

    from pydantic import BaseModel

    from _llm._llm.tool_registry import register_tool, get_tools, execute_tool
    from _llm import LLM, ChatCompletionMessage

    # Define structured output models
    class CalculationStep(BaseModel):
        step_number: int
        description: str
        calculation: str
        result: float

    class NameResponse(BaseModel):
        name: str
        age: int
        occupation: str
        hobbies: List[str]

    class MathSolution(BaseModel):
        steps: List[CalculationStep]
        final_answer: float
        explanation: str
    llm = LLM()
    response = llm.generate(
        query="""Create a simple math problem solution in JSON format with this structure:
        {
            "problem": "the math problem",
            "steps": ["step 1", "step 2", "step 3"],
            "answer": "final answer"
        }
        Problem: What is 12 * 8 + 15?""",
        model='vllm',
        format=MathSolution
    )
    print(response.content.steps)
    exit()
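    # NOTE: nothing below this point runs while the exit() above is in place;
    # the ad-hoc checks above short-circuit the whole suite.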
    llm = LLM(silent=False, chat=False)  # Don't persist chat history
    response = llm.generate("Hello! Can you introduce yourself briefly?", model='vllm', format=NameResponse)
    print(response.__dict__)
    response = llm.generate("What's the weather like in San Francisco? Also calculate 15 * 7 for me.", model='vllm')
    print(response.__dict__)

    # Define a tool for calculations
    @register_tool
    def calculate_tool(number: int, multiply_factor: int) -> int:
        '''Multiply a number by a factor

        Args:
            number (int): The number to be multiplied
            multiply_factor (int): The factor to multiply by

        Returns:
            int: The result of the multiplication
        '''
        return number * multiply_factor
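    # register_tool (see the tool_registry diff below) derives the OpenAI-style
    # function schema for this tool from its signature and the Google-style
    # docstring above, which is what get_tools() hands to the model.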
    async def run_tests():
        print("🧪 Testing LLM class with vLLM model")
        print("=" * 50)

        # Initialize LLM instance - use a fresh instance for each test
        def get_fresh_llm():
            return LLM(silent=False, chat=False)  # Don't persist chat history

        # Test 1: Basic vLLM generation
        print("\n1 Basic vLLM Generation Test")
        print("-" * 30)
        try:
            llm = get_fresh_llm()
            response = llm.generate(
                query="Hello! Can you introduce yourself briefly?",
                model='vllm'
            )
            print(f"✅ Basic response: {response.content[:100]}...")
        except Exception as e:
            print(f"❌ Basic test failed: {e}")

        # Test 2: Tools usage
        print("\n2 Tools Usage Test")
        print("-" * 30)
        try:
            llm = get_fresh_llm()
            tools = get_tools()
            response = llm.generate(
                query="What's the weather like in San Francisco? Also calculate 15 * 7 for me.",
                model='vllm',
                tools=tools
            )
            print(f"✅ Tools response: {response.content[:100]}...")

            # Enhanced tool call detection
            tool_calls_found = False
            if hasattr(response, 'tool_calls') and response.tool_calls:
                print(f"🔧 OpenAI-style tool calls detected: {len(response.tool_calls)} calls")
                for i, tool_call in enumerate(response.tool_calls):
                    print(f" Tool {i+1}: {tool_call.function.name}")
                    print(f" Arguments: {tool_call.function.arguments}")
                tool_calls_found = True

            # Check whether the response contains JSON that might be tool-like
            if not tool_calls_found:
                try:
                    import json
                    # Try to parse the content as JSON
                    content_json = json.loads(response.content)
                    if isinstance(content_json, dict):
                        print("🔧 JSON-formatted response detected (not OpenAI tool calls)")
                        print(f" Keys: {list(content_json.keys())}")
                        # Check if it looks like a tool call
                        if any(key in content_json for key in ['location', 'expression', 'function', 'name']):
                            print(" This appears to be tool-like output in JSON format")
                except json.JSONDecodeError:
                    print(" No structured tool calls or JSON found")
        except Exception as e:
            print(f"❌ Tools test failed: {e}")

        # Test 3: Thinking mode (use the vLLM model since a dedicated reasoning model doesn't exist)
        print("\n3 Thinking Mode Test (using vllm)")
        print("-" * 30)
        try:
            llm = get_fresh_llm()
            response: ChatCompletionMessage = llm.generate(
                query="Solve this step by step: If I have 20 apples, eat 3, give away 5, then buy 8 more, how many do I have?",
                model='vllm',  # Use vllm instead of reasoning
                think=True
            )
            print(f"✅ Thinking response: {response.content[:100]}...")
            if hasattr(response, 'reasoning_content') and response.reasoning_content:
                print(f"🧠 Thinking content: {response.reasoning_content[:100]}...")
            else:
                print(" No explicit thinking content found")
        except Exception as e:
            print(f"❌ Thinking test failed: {e}")

        # Test 4: Streaming (simplified test)
        print("\n4 Streaming Test")
        print("-" * 30)
        try:
            llm = get_fresh_llm()
            print("Streaming response: ", end="")
            stream = llm.generate(
                query="Explain photosynthesis in 2 sentences",
                model='vllm',
                stream=True
            )
            content_parts = []
            try:
                for chunk_type, chunk_content in stream:
                    if chunk_type == "content":
                        content_parts.append(chunk_content)
                        print(chunk_content, end="")
                    elif chunk_type == "thinking":
                        print(f"\033[94m{chunk_content}\033[0m", end="")  # Blue for thinking
                print(f"\n✅ Streaming completed - Content: {len(content_parts)} chunks")
            except Exception as stream_error:
                print(f"\n❌ Stream processing failed: {stream_error}")
        except Exception as e:
            print(f"❌ Streaming test failed: {e}")

        # Test 5: Structured output (JSON mode)
        print("\n5 Structured Output Test")
        print("-" * 30)
        try:
            llm = get_fresh_llm()
            response = llm.generate(
                query="""Create a simple math problem solution in JSON format with this structure:
                {
                    "problem": "the math problem",
                    "steps": ["step 1", "step 2", "step 3"],
                    "answer": "final answer"
                }
                Problem: What is 12 * 8 + 15?""",
                model='vllm',
                format=MathSolution
            )
            print(f"✅ Structured response: {response.content[:150]}...")

            # Try to parse as JSON to verify the structure
            try:
                import json
                parsed = json.loads(response.content)
                print(f"🎯 Valid JSON with keys: {list(parsed.keys())}")
            except json.JSONDecodeError:
                print(" Response is not valid JSON")
        except Exception as e:
            print(f"❌ Structured output test failed: {e}")

        # Test 6: Async generation
        print("\n6 Async Generation Test")
        print("-" * 30)
        try:
            llm = get_fresh_llm()
            response = await llm.async_generate(
                query="What's the capital of France? Answer briefly.",
                model='vllm'
            )
            print(f"✅ Async response: {response.content[:100]}...")
        except Exception as e:
            print(f"❌ Async test failed: {e}")

        # Test 7: Multiple tools with vllm (enhanced debugging)
        print("\n7 Complex Integration Test")
        print("-" * 30)
        try:
            llm = get_fresh_llm()
            tools = get_tools()  # Get all registered tools
            response = llm.generate(
                query="I need to multiply 12 by 11",
                model='vllm',
                tools=tools,
                think=True
            )
            print(f"✅ Complex response: {response.content[:100]}...")

            # Enhanced checking for both thinking and tool usage
            has_thinking = hasattr(response, 'reasoning_content') and response.reasoning_content
            has_tool_calls = hasattr(response, 'tool_calls') and response.tool_calls
            print(f"🧠 Has thinking: {has_thinking}")
            if has_thinking:
                print(f" Thinking content: {response.reasoning_content[:50]}...")
            print(f"🔧 Has OpenAI tool calls: {has_tool_calls}")
            if has_tool_calls:
                print(f" Tool calls count: {len(response.tool_calls)}")
                for i, tool_call in enumerate(response.tool_calls):
                    print(f" Tool {i+1}: {tool_call.function.name}")

            # Check for JSON-style tool responses
            try:
                import json
                content_json = json.loads(response.content)
                if isinstance(content_json, dict) and any(key in content_json for key in ['expression', 'calculation', 'result']):
                    print("🔧 JSON-style tool response detected:")
                    print(f" Content: {content_json}")
            except json.JSONDecodeError:
                pass
        except Exception as e:
            print(f"❌ Complex test failed: {e}")

        # New Test 8: Tool Call Format Analysis
        print("\n8 Tool Call Format Analysis")
        print("-" * 30)
        try:
            llm = get_fresh_llm()
            tools = get_tools()

            # Test with an explicit tool instruction
            response = llm.generate(
                query="Use the calculate tool to compute 25 * 4. Make sure to call the function.",
                model='vllm',
                tools=tools
            )
            print(f"Response content: {response.content}")
            print(f"Response type: {type(response)}")
            print(f"Has tool_calls attribute: {hasattr(response, 'tool_calls')}")
            if hasattr(response, 'tool_calls') and response.tool_calls:
                print(f"Tool calls count: {len(response.tool_calls)}")
                print(f"Tool calls type: {type(response.tool_calls)}")
                for i, tool_call in enumerate(response.tool_calls):
                    print(f"Tool {i+1}:")
                    print(f" ID: {tool_call.id}")
                    print(f" Type: {tool_call.type}")
                    print(f" Function name: {tool_call.function.name}")
                    print(f" Function arguments: {tool_call.function.arguments}")
            else:
                print("No tool calls found")
        except Exception as e:
            print(f"❌ Tool format analysis failed: {e}")

        # New Test 9: vLLM Tool Response Conversion Test
        print("\n9 vLLM Tool Response Conversion Test")
        print("-" * 30)
        try:
            llm = get_fresh_llm()
            tools = get_tools()  # Get all registered tools

            # Test multiple tool scenarios
            test_cases = [
                "Calculate 15 * 7 using the calculate tool",
                "Get weather for New York using the weather tool",
                "Use both tools: calculate 20 + 5 and get weather for London"
            ]
            for i, test_query in enumerate(test_cases, 1):
                print(f"\n Test {i}: {test_query}")
                response = llm.generate(
                    query=test_query,
                    model='vllm',
                    tools=tools
                )
                print(f" Response: {response.content[:60]}...")
                if hasattr(response, 'tool_calls') and response.tool_calls:
                    print(f" ✅ Converted to {len(response.tool_calls)} tool call(s)")
                    for j, tool_call in enumerate(response.tool_calls):
                        print(f" Tool {j+1}: {tool_call.function.name}")
                else:
                    print(" No tool calls detected")
        except Exception as e:
            print(f"❌ vLLM conversion test failed: {e}")

        print("\n" + "=" * 50)
        print("🏁 Test suite completed!")

    # Helper function for non-async testing
    def translate_to_spanish(text):
        llm = LLM()
        prompt = f"Translate the following text to Spanish:\n\n{text}"
        response = llm.generate(query=prompt, model='vllm')
        return response.content

    # Run the test suite
    print("Starting comprehensive test suite...")
    asyncio.run(run_tests())

    # Quick translation test
    print("\n🌍 Translation Test:")
    spanish_text = translate_to_spanish("Hello, how are you today?")
    print(f"Spanish translation: {spanish_text}")

@@ -24,39 +24,82 @@ def _pytype_to_jsonschema(t):
     }
     return mapping.get(t, {"type": "string"})

-# --- docstring parser (Google style) ---
+# --- docstring parser (Google style) - FIXED VERSION ---
 def _parse_google_docstring(docstring: str):
     if not docstring:
         return {"description": "", "params": {}}
     lines = [ln.rstrip() for ln in docstring.splitlines()]
+
+    # Find where Args/Arguments section starts
+    args_start = None
+    for i, line in enumerate(lines):
+        if line.strip().lower() in ("args:", "arguments:"):
+            args_start = i
+            break
+
+    # Find where Args section ends (Returns:, Raises:, or another section)
+    args_end = len(lines)
+    if args_start is not None:
+        for i in range(args_start + 1, len(lines)):
+            line = lines[i].strip().lower()
+            if line.endswith(':') and line.rstrip(':') in ('returns', 'return', 'raises', 'raise', 'yields', 'yield', 'examples', 'example', 'notes', 'note'):
+                args_end = i
+                break
+
+    # Build description from everything EXCEPT the Args section content
     desc_lines = []
-    i = 0
-    while i < len(lines) and not lines[i].lower().startswith(("args:", "arguments:")):
-        if lines[i].strip():
-            desc_lines.append(lines[i].strip())
-        i += 1
+    # Before Args
+    if args_start is not None:
+        for i in range(args_start):
+            if lines[i].strip():
+                desc_lines.append(lines[i].strip())
+    else:
+        # No Args section, include everything
+        for line in lines:
+            if line.strip():
+                desc_lines.append(line.strip())
+
+    # After Args section (Returns, examples, etc.)
+    if args_start is not None and args_end < len(lines):
+        for i in range(args_end, len(lines)):
+            if lines[i].strip():
+                desc_lines.append(lines[i].strip())
+
     description = " ".join(desc_lines).strip()

+    # Parse parameters from Args section
     params = {}
-    if i < len(lines):
-        i += 1
-        while i < len(lines):
+    if args_start is not None:
+        i = args_start + 1
+        while i < args_end:
             line = lines[i].strip()
             if not line:
                 i += 1
                 continue
+            # Match parameter line: "param_name (type): description" or "param_name: description"
             m = re.match(r'^(\w+)\s*(?:\(([^)]+)\))?\s*:\s*(.*)$', line)
             if m:
                 name = m.group(1)
                 desc = m.group(3)
+                # Collect continuation lines for this parameter
                 j = i + 1
-                while j < len(lines) and not re.match(r'^\w+\s*(?:\([^)]+\))?\s*:', lines[j].strip()):
-                    if lines[j].strip():
-                        desc += " " + lines[j].strip()
+                while j < args_end:
+                    next_line = lines[j].strip()
+                    # Check if it's a new parameter or empty
+                    if not next_line or re.match(r'^\w+\s*(?:\([^)]+\))?\s*:', next_line):
+                        break
+                    desc += " " + next_line
                     j += 1
                 params[name] = {"description": desc.strip(), "type": m.group(2)}
                 i = j
                 continue
             i += 1
     return {"description": description, "params": params}
 # --- helper: make OpenAI-style function spec ---

@@ -109,9 +152,26 @@ def register_tool(func: Callable = None, *, name: str = None, description: str =
     return _register(func)

 # --- what to send to model ---
-def get_tools() -> List[dict]:
+def get_tools(specific_tools: list[str] = False, exclude_tools: list[str] = False) -> List[dict]:
     """Return OpenAI-compatible functions list with proper 'function' wrapper."""
-    return [entry["schema"] for entry in TOOL_REGISTRY.values()]
+    assert not (specific_tools and exclude_tools), "Cannot specify both specific_tools and exclude_tools"
+    if isinstance(specific_tools, str):
+        specific_tools = [specific_tools]
+    if specific_tools:
+        # Return named tools only
+        result = []
+        for t in specific_tools:
+            entry = TOOL_REGISTRY.get(t)
+            if entry:
+                result.append(entry["schema"])
+    elif exclude_tools:
+        all_tools = [entry["schema"] for entry in TOOL_REGISTRY.values()]
+        result = [t for t in all_tools if t["function"]["name"] not in exclude_tools]
+    else:
+        # Return all registered tools
+        result = [entry["schema"] for entry in TOOL_REGISTRY.values()]
+    return result
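The new filters can be exercised like this (calculate_tool is the tool registered in the tests; passing both arguments trips the assertion above):

    get_tools()                                  # every registered schema
    get_tools(specific_tools="calculate_tool")   # a bare string is wrapped into a list
    get_tools(exclude_tools=["calculate_tool"])  # everything except the named tools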
 # --- robust parser for arguments ---
 def parse_function_call_arguments(raw) -> dict:
@@ -173,4 +233,4 @@ def execute_tool(name: str, args: dict):
         else:
             kwargs[pname] = val
     result = fn(**kwargs)
-    return result
+    return result
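End to end, and assuming the registry keys tools by function name by default (the name= override on register_tool suggests the function's own name is the fallback), the round trip for the test tool would look like:

    from _llm._llm.tool_registry import execute_tool

    # calculate_tool as registered in tests_for_llm.py above;
    # execute_tool builds kwargs from args and calls the function.
    execute_tool("calculate_tool", {"number": 12, "multiply_factor": 11})  # expected: 132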