|
|
|
|
@@ -50,11 +50,33 @@ except ImportError: |
|
|
|
|
env_manager.set_env() |
|
|
|
|
|
|
|
|
|
tokenizer = tiktoken.get_encoding("cl100k_base") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _strip_think_tags(text: str) -> str: |
|
|
|
|
""" |
|
|
|
|
Remove <think>...</think> blocks from the text. |
|
|
|
|
|
|
|
|
|
Qwen3 (and some other models) embed their chain-of-thought reasoning |
|
|
|
|
directly in the content field wrapped in <think> tags when the |
|
|
|
|
reasoning_content API field is not used. This helper strips those |
|
|
|
|
blocks so they are never shown to end-users. |
|
|
|
|
|
|
|
|
|
Args: |
|
|
|
|
text: The raw LLM output string, possibly containing <think> blocks. |
|
|
|
|
|
|
|
|
|
Returns: |
|
|
|
|
The text with all <think>...</think> sections removed and |
|
|
|
|
surrounding whitespace trimmed. |
|
|
|
|
""" |
|
|
|
|
# re.DOTALL so that '.' matches newlines inside the thinking block |
|
|
|
|
cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL) |
|
|
|
|
return cleaned.strip() |
|
|
|
|
# Fallback context window (in tokens) for models with no explicit entry below.
DEFAULT_CONTEXT_WINDOW = int(os.getenv("LLM_MAX_CONTEXT_TOKENS", "32000"))

# Per-model context-window overrides, each tunable via an environment variable.
MODEL_CONTEXT_WINDOWS = {
    # NOTE(review): the source had two adjacent, conflicting getenv() calls
    # here (defaults "8192" and "32768") — a stripped old/new diff pair that
    # is a syntax error as written.  The later "32768" default is kept;
    # confirm against the deployment configuration.
    "/mnt/model_drive/models/gpt-oss-20b": int(
        os.getenv("LLM_CONTEXT_GPT_OSS_20B", "32768")
    ),
    "qwen3-14b": int(os.getenv("LLM_CONTEXT_QWEN_14B", "32768")),
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -98,7 +120,7 @@ class LLM: |
|
|
|
|
timeout: int = 240, |
|
|
|
|
on_vpn: bool = False, |
|
|
|
|
silent: bool = False, |
|
|
|
|
presence_penalty: float = 0.6, |
|
|
|
|
presence_penalty: float = 0.3, |
|
|
|
|
top_p: float = 0.9, |
|
|
|
|
extra_body: Optional[Dict[str, Any]] = None, |
|
|
|
|
) -> None: |
|
|
|
|
@@ -129,7 +151,9 @@ class LLM: |
|
|
|
|
"presence_penalty": presence_penalty, |
|
|
|
|
"top_p": top_p, |
|
|
|
|
} |
|
|
|
|
self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.15} |
|
|
|
|
# repetition_penalty > 1.0 scales down logits for already-seen tokens. |
|
|
|
|
# 1.2 is enough to break generation loops without hurting quality. |
|
|
|
|
self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.2} |
|
|
|
|
self.messages = messages or [{"role": "system", "content": self.system_message}] |
|
|
|
|
self.max_length_answer = max_length_answer |
|
|
|
|
self.chat = chat |
|
|
|
|
@@ -177,7 +201,7 @@ class LLM: |
|
|
|
|
sensible defaults while still accepting fully qualified model names. |
|
|
|
|
""" |
|
|
|
|
default_model = os.getenv( |
|
|
|
|
"LLM_MODEL_VLLM", "/mnt/model_drive/models/gpt-oss-20b" |
|
|
|
|
"LLM_MODEL_VLLM", "qwen3-14b" |
|
|
|
|
) |
|
|
|
|
if model_alias in {None, "", "vllm"}: |
|
|
|
|
return default_model |
|
|
|
|
@@ -479,7 +503,9 @@ class LLM: |
|
|
|
|
response: ChatCompletion = self.client.chat.completions.create( |
|
|
|
|
model=model, |
|
|
|
|
messages=self.messages, |
|
|
|
|
frequency_penalty=0.4, |
|
|
|
|
# frequency_penalty removed: repetition_penalty in extra_body |
|
|
|
|
# already handles this. Having both causes compounding effects |
|
|
|
|
# that push the model into token loops. |
|
|
|
|
stream=stream, |
|
|
|
|
temperature=options["temperature"], |
|
|
|
|
presence_penalty=options["presence_penalty"], |
|
|
|
|
@@ -497,6 +523,8 @@ class LLM: |
|
|
|
|
print() |
|
|
|
|
print('TOOLS') |
|
|
|
|
print_rainbow(tools, single_line=True) |
|
|
|
|
# Re-raise the exception to inform the caller of the API failure |
|
|
|
|
raise |
|
|
|
|
|
|
|
|
|
# Try to extract backend information if available |
|
|
|
|
try: |
|
|
|
|
@@ -747,6 +775,13 @@ class LLM: |
|
|
|
|
if hasattr(message, "content_text"): |
|
|
|
|
result: str = message.content_text |
|
|
|
|
|
|
|
|
|
# Qwen3 and some other models include <think>...</think> blocks directly |
|
|
|
|
# in the content field instead of (or in addition to) using reasoning_content. |
|
|
|
|
# Strip those blocks so they never leak to the user. |
|
|
|
|
if isinstance(result, str): |
|
|
|
|
result = _strip_think_tags(result) |
|
|
|
|
message.content = result |
|
|
|
|
|
|
|
|
|
# Spara i meddelandehistorik (utan verktygsanrop för ren historik) |
|
|
|
|
self.messages.append({"role": "assistant", "content": result}) |
|
|
|
|
if not self.chat: |
|
|
|
|
@@ -809,6 +844,11 @@ class LLM: |
|
|
|
|
message = choice.message |
|
|
|
|
result = message.content |
|
|
|
|
|
|
|
|
|
# Strip Qwen3-style <think>...</think> blocks from content |
|
|
|
|
if isinstance(result, str): |
|
|
|
|
result = _strip_think_tags(result) |
|
|
|
|
message.content = result |
|
|
|
|
|
|
|
|
|
# Extract thinking from reasoning_content if present |
|
|
|
|
thinking_content = None |
|
|
|
|
if ( |
|
|
|
|
@@ -1145,6 +1185,12 @@ if __name__ == "__main__": |
|
|
|
|
explanation: str |
|
|
|
|
|
|
|
|
|
llm = LLM() |
|
|
|
|
print('LLM URL:', llm.host_url) |
|
|
|
|
# base_url must be just the /v1 root — the OpenAI SDK appends the |
|
|
|
|
# correct path itself (/v1/responses or /v1/chat/completions). |
|
|
|
|
# Setting it to the full endpoint path causes the SDK to produce |
|
|
|
|
# invalid URLs like /v1/chat/completions/responses. |
|
|
|
|
llm.host_url = "http://192.168.1.12:8000/v1" |
|
|
|
|
response = llm.generate( |
|
|
|
|
query="""Create a simple math problem solution in JSON format with this structure: |
|
|
|
|
{ |
|
|
|
|
|