diff --git a/_llm/llm.py b/_llm/llm.py index 429c046..6f78725 100644 --- a/_llm/llm.py +++ b/_llm/llm.py @@ -50,11 +50,33 @@ except ImportError: env_manager.set_env() tokenizer = tiktoken.get_encoding("cl100k_base") + + +def _strip_think_tags(text: str) -> str: + """ + Remove <think>...</think> blocks from the text. + + Qwen3 (and some other models) embed their chain-of-thought reasoning + directly in the content field wrapped in <think> tags when the + reasoning_content API field is not used. This helper strips those + blocks so they are never shown to end-users. + + Args: + text: The raw LLM output string, possibly containing <think> blocks. + + Returns: + The text with all <think>...</think> sections removed and + surrounding whitespace trimmed. + """ + # re.DOTALL so that '.' matches newlines inside the thinking block + cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL) + return cleaned.strip() DEFAULT_CONTEXT_WINDOW = int(os.getenv("LLM_MAX_CONTEXT_TOKENS", "32000")) MODEL_CONTEXT_WINDOWS = { "/mnt/model_drive/models/gpt-oss-20b": int( - os.getenv("LLM_CONTEXT_GPT_OSS_20B", "8192") + os.getenv("LLM_CONTEXT_GPT_OSS_20B", "32768") ), + "qwen3-14b": int(os.getenv("LLM_CONTEXT_QWEN_14B", "32768")), } @@ -98,7 +120,7 @@ class LLM: timeout: int = 240, on_vpn: bool = False, silent: bool = False, - presence_penalty: float = 0.6, + presence_penalty: float = 0.3, top_p: float = 0.9, extra_body: Optional[Dict[str, Any]] = None, ) -> None: @@ -129,7 +151,9 @@ class LLM: "presence_penalty": presence_penalty, "top_p": top_p, } - self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.15} + # repetition_penalty > 1.0 scales down logits for already-seen tokens. + # 1.2 is enough to break generation loops without hurting quality. 
+ self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.2} self.messages = messages or [{"role": "system", "content": self.system_message}] self.max_length_answer = max_length_answer self.chat = chat @@ -177,7 +201,7 @@ class LLM: sensible defaults while still accepting fully qualified model names. """ default_model = os.getenv( - "LLM_MODEL_VLLM", "/mnt/model_drive/models/gpt-oss-20b" + "LLM_MODEL_VLLM", "qwen3-14b" ) if model_alias in {None, "", "vllm"}: return default_model @@ -479,7 +503,9 @@ class LLM: response: ChatCompletion = self.client.chat.completions.create( model=model, messages=self.messages, - frequency_penalty=0.4, + # frequency_penalty removed: repetition_penalty in extra_body + # already handles this. Having both causes compounding effects + # that push the model into token loops. stream=stream, temperature=options["temperature"], presence_penalty=options["presence_penalty"], @@ -497,6 +523,8 @@ class LLM: print() print('TOOLS') print_rainbow(tools, single_line=True) + # Re-raise the exception to inform the caller of the API failure + raise # Try to extract backend information if available try: @@ -747,6 +775,13 @@ class LLM: if hasattr(message, "content_text"): result: str = message.content_text + # Qwen3 and some other models include <think>...</think> blocks directly + # in the content field instead of (or in addition to) using reasoning_content. + # Strip those blocks so they never leak to the user. + if isinstance(result, str): + result = _strip_think_tags(result) + message.content = result + # Spara i meddelandehistorik (utan verktygsanrop för ren historik) self.messages.append({"role": "assistant", "content": result}) if not self.chat: @@ -809,6 +844,11 @@ class LLM: message = choice.message result = message.content + # Strip Qwen3-style <think>...</think> 
blocks from content + if isinstance(result, str): + result = _strip_think_tags(result) + message.content = result + # Extract thinking from reasoning_content if present thinking_content = None if ( @@ -1145,6 +1185,12 @@ if __name__ == "__main__": explanation: str llm = LLM() + print('LLM URL:', llm.host_url) + # base_url must be just the /v1 root — the OpenAI SDK appends the + # correct path itself (/v1/responses or /v1/chat/completions). + # Setting it to the full endpoint path causes the SDK to produce + # invalid URLs like /v1/chat/completions/responses. + llm.host_url = "http://192.168.1.12:8000/v1" response = llm.generate( query="""Create a simple math problem solution in JSON format with this structure: {