diff --git a/_llm/llm.py b/_llm/llm.py
index 429c046..6f78725 100644
--- a/_llm/llm.py
+++ b/_llm/llm.py
@@ -50,11 +50,33 @@ except ImportError:
env_manager.set_env()
tokenizer = tiktoken.get_encoding("cl100k_base")
+
+
+def _strip_think_tags(text: str) -> str:
+ """
+ Remove <think>...</think> blocks from the text.
+
+ Qwen3 (and some other models) embed their chain-of-thought reasoning
+ directly in the content field wrapped in <think> tags when the
+ reasoning_content API field is not used. This helper strips those
+ blocks so they are never shown to end-users.
+
+ Args:
+ text: The raw LLM output string, possibly containing <think> blocks.
+
+ Returns:
+ The text with all <think>...</think> sections removed and
+ surrounding whitespace trimmed.
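+
+ Example (illustrative, assuming Qwen3-style <think> markup):
+ >>> _strip_think_tags("<think>internal reasoning</think>Final answer.")
+ 'Final answer.'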
+ """
+ # re.DOTALL so that '.' matches newlines inside the thinking block
+ cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
+ return cleaned.strip()
DEFAULT_CONTEXT_WINDOW = int(os.getenv("LLM_MAX_CONTEXT_TOKENS", "32000"))
MODEL_CONTEXT_WINDOWS = {
"/mnt/model_drive/models/gpt-oss-20b": int(
- os.getenv("LLM_CONTEXT_GPT_OSS_20B", "8192")
+ os.getenv("LLM_CONTEXT_GPT_OSS_20B", "32768")
),
+ "qwen3-14b": int(os.getenv("LLM_CONTEXT_QWEN_14B", "32768")),
}
@@ -98,7 +120,7 @@ class LLM:
timeout: int = 240,
on_vpn: bool = False,
silent: bool = False,
- presence_penalty: float = 0.6,
+ presence_penalty: float = 0.3,
top_p: float = 0.9,
extra_body: Optional[Dict[str, Any]] = None,
) -> None:
@@ -129,7 +151,9 @@ class LLM:
"presence_penalty": presence_penalty,
"top_p": top_p,
}
- self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.15}
+ # repetition_penalty > 1.0 penalizes tokens that have already been generated,
+ # making exact repetition less likely. 1.2 is typically enough to break
+ # generation loops without noticeably degrading output quality.
+ self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.2}
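+ # Illustrative usage (assumption, not exercised in this diff): callers can
+ # still supply their own dict, e.g. LLM(extra_body={"repetition_penalty": 1.0}),
+ # and that dict is used verbatim instead of this default.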
self.messages = messages or [{"role": "system", "content": self.system_message}]
self.max_length_answer = max_length_answer
self.chat = chat
@@ -177,7 +201,7 @@ class LLM:
sensible defaults while still accepting fully qualified model names.
"""
default_model = os.getenv(
- "LLM_MODEL_VLLM", "/mnt/model_drive/models/gpt-oss-20b"
+ "LLM_MODEL_VLLM", "qwen3-14b"
)
if model_alias in {None, "", "vllm"}:
return default_model
@@ -479,7 +503,9 @@ class LLM:
response: ChatCompletion = self.client.chat.completions.create(
model=model,
messages=self.messages,
- frequency_penalty=0.4,
+ # frequency_penalty removed: repetition_penalty in extra_body
+ # already handles this. Having both causes compounding effects
+ # that push the model into token loops.
stream=stream,
temperature=options["temperature"],
presence_penalty=options["presence_penalty"],
@@ -497,6 +523,8 @@ class LLM:
print()
print('TOOLS')
print_rainbow(tools, single_line=True)
+ # Re-raise the exception to inform the caller of the API failure
+ raise
# Try to extract backend information if available
try:
@@ -747,6 +775,13 @@ class LLM:
if hasattr(message, "content_text"):
result: str = message.content_text
+ # Qwen3 and some other models include <think>...</think> blocks directly
+ # in the content field instead of (or in addition to) using reasoning_content.
+ # Strip those blocks so they never leak to the user.
+ if isinstance(result, str):
+ result = _strip_think_tags(result)
+ message.content = result
+
# Save to message history (without tool calls, for a clean history)
self.messages.append({"role": "assistant", "content": result})
if not self.chat:
@@ -809,6 +844,11 @@ class LLM:
message = choice.message
result = message.content
+ # Strip Qwen3-style <think>...</think> blocks from content
+ if isinstance(result, str):
+ result = _strip_think_tags(result)
+ message.content = result
+
# Extract thinking from reasoning_content if present
thinking_content = None
if (
@@ -1145,6 +1185,12 @@ if __name__ == "__main__":
explanation: str
llm = LLM()
+ print('LLM URL:', llm.host_url)
+ # base_url must be just the /v1 root — the OpenAI SDK appends the
+ # correct path itself (/v1/responses or /v1/chat/completions).
+ # Setting it to the full endpoint path causes the SDK to produce
+ # invalid URLs like /v1/chat/completions/responses.
+ llm.host_url = "http://192.168.1.12:8000/v1"
response = llm.generate(
query="""Create a simple math problem solution in JSON format with this structure:
{