diff --git a/_llm/llm.py b/_llm/llm.py index 429c046..6f78725 100644 --- a/_llm/llm.py +++ b/_llm/llm.py @@ -50,11 +50,33 @@ except ImportError: env_manager.set_env() tokenizer = tiktoken.get_encoding("cl100k_base") + + +def _strip_think_tags(text: str) -> str: + """ + Remove <think>...</think> blocks from the text. + + Qwen3 (and some other models) embed their chain-of-thought reasoning + directly in the content field wrapped in <think> tags when the + reasoning_content API field is not used. This helper strips those + blocks so they are never shown to end-users. + + Args: + text: The raw LLM output string, possibly containing <think> blocks. + + Returns: + The text with all <think>...</think> sections removed and + surrounding whitespace trimmed. + """ + # re.DOTALL so that '.' matches newlines inside the thinking block + cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL) + return cleaned.strip() DEFAULT_CONTEXT_WINDOW = int(os.getenv("LLM_MAX_CONTEXT_TOKENS", "32000")) MODEL_CONTEXT_WINDOWS = { "/mnt/model_drive/models/gpt-oss-20b": int( - os.getenv("LLM_CONTEXT_GPT_OSS_20B", "8192") + os.getenv("LLM_CONTEXT_GPT_OSS_20B", "32768") ), + "qwen3-14b": int(os.getenv("LLM_CONTEXT_QWEN_14B", "32768")), } @@ -98,7 +120,7 @@ class LLM: timeout: int = 240, on_vpn: bool = False, silent: bool = False, - presence_penalty: float = 0.6, + presence_penalty: float = 0.3, top_p: float = 0.9, extra_body: Optional[Dict[str, Any]] = None, ) -> None: @@ -129,7 +151,9 @@ class LLM: "presence_penalty": presence_penalty, "top_p": top_p, } - self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.15} + # repetition_penalty > 1.0 scales down logits for already-seen tokens. + # 1.2 is enough to break generation loops without hurting quality. 
+ self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.2} self.messages = messages or [{"role": "system", "content": self.system_message}] self.max_length_answer = max_length_answer self.chat = chat @@ -177,7 +201,7 @@ class LLM: sensible defaults while still accepting fully qualified model names. """ default_model = os.getenv( - "LLM_MODEL_VLLM", "/mnt/model_drive/models/gpt-oss-20b" + "LLM_MODEL_VLLM", "qwen3-14b" ) if model_alias in {None, "", "vllm"}: return default_model @@ -479,7 +503,9 @@ class LLM: response: ChatCompletion = self.client.chat.completions.create( model=model, messages=self.messages, - frequency_penalty=0.4, + # frequency_penalty removed: repetition_penalty in extra_body + # already handles this. Having both causes compounding effects + # that push the model into token loops. stream=stream, temperature=options["temperature"], presence_penalty=options["presence_penalty"], @@ -497,6 +523,8 @@ class LLM: print() print('TOOLS') print_rainbow(tools, single_line=True) + # Re-raise the exception to inform the caller of the API failure + raise # Try to extract backend information if available try: @@ -747,6 +775,13 @@ class LLM: if hasattr(message, "content_text"): result: str = message.content_text + # Qwen3 and some other models include <think>...</think> blocks directly + # in the content field instead of (or in addition to) using reasoning_content. + # Strip those blocks so they never leak to the user. + if isinstance(result, str): + result = _strip_think_tags(result) + message.content = result + # Spara i meddelandehistorik (utan verktygsanrop för ren historik) self.messages.append({"role": "assistant", "content": result}) if not self.chat: @@ -809,6 +844,11 @@ class LLM: message = choice.message result = message.content + # Strip Qwen3-style <think>...</think> 
blocks from content + if isinstance(result, str): + result = _strip_think_tags(result) + message.content = result + # Extract thinking from reasoning_content if present thinking_content = None if ( @@ -1145,6 +1185,12 @@ if __name__ == "__main__": explanation: str llm = LLM() + print('LLM URL:', llm.host_url) + # base_url must be just the /v1 root — the OpenAI SDK appends the + # correct path itself (/v1/responses or /v1/chat/completions). + # Setting it to the full endpoint path causes the SDK to produce + # invalid URLs like /v1/chat/completions/responses. + llm.host_url = "http://192.168.1.12:8000/v1" response = llm.generate( query="""Create a simple math problem solution in JSON format with this structure: {