Add _strip_think_tags function to clean LLM output and adjust model parameters

main
Lasse Server 3 weeks ago
parent 1170c95f22
commit 1950c02bd8
  1. 56
      _llm/llm.py

@ -50,11 +50,33 @@ except ImportError:
env_manager.set_env()
tokenizer = tiktoken.get_encoding("cl100k_base")
def _strip_think_tags(text: str) -> str:
"""
Remove <think>...</think> blocks from the text.
Qwen3 (and some other models) embed their chain-of-thought reasoning
directly in the content field wrapped in <think> tags when the
reasoning_content API field is not used. This helper strips those
blocks so they are never shown to end-users.
Args:
text: The raw LLM output string, possibly containing <think> blocks.
Returns:
The text with all <think>...</think> sections removed and
surrounding whitespace trimmed.
"""
# re.DOTALL so that '.' matches newlines inside the thinking block
cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
return cleaned.strip()
# Fallback context window (in tokens) for any model not listed below.
DEFAULT_CONTEXT_WINDOW = int(os.getenv("LLM_MAX_CONTEXT_TOKENS", "32000"))

# Per-model context-window overrides, each individually tunable via an
# environment variable. Keys match the model identifiers passed to the
# backend (a filesystem path for the local gpt-oss build, an alias for
# qwen3-14b).
MODEL_CONTEXT_WINDOWS = {
    "/mnt/model_drive/models/gpt-oss-20b": int(
        os.getenv("LLM_CONTEXT_GPT_OSS_20B", "32768")
    ),
    "qwen3-14b": int(os.getenv("LLM_CONTEXT_QWEN_14B", "32768")),
}
@ -98,7 +120,7 @@ class LLM:
timeout: int = 240,
on_vpn: bool = False,
silent: bool = False,
presence_penalty: float = 0.6,
presence_penalty: float = 0.3,
top_p: float = 0.9,
extra_body: Optional[Dict[str, Any]] = None,
) -> None:
@ -129,7 +151,9 @@ class LLM:
"presence_penalty": presence_penalty,
"top_p": top_p,
}
self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.15}
# repetition_penalty > 1.0 scales down logits for already-seen tokens.
# 1.2 is enough to break generation loops without hurting quality.
self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.2}
self.messages = messages or [{"role": "system", "content": self.system_message}]
self.max_length_answer = max_length_answer
self.chat = chat
@ -177,7 +201,7 @@ class LLM:
sensible defaults while still accepting fully qualified model names.
"""
default_model = os.getenv(
"LLM_MODEL_VLLM", "/mnt/model_drive/models/gpt-oss-20b"
"LLM_MODEL_VLLM", "qwen3-14b"
)
if model_alias in {None, "", "vllm"}:
return default_model
@ -479,7 +503,9 @@ class LLM:
response: ChatCompletion = self.client.chat.completions.create(
model=model,
messages=self.messages,
frequency_penalty=0.4,
# frequency_penalty removed: repetition_penalty in extra_body
# already handles this. Having both causes compounding effects
# that push the model into token loops.
stream=stream,
temperature=options["temperature"],
presence_penalty=options["presence_penalty"],
@ -497,6 +523,8 @@ class LLM:
print()
print('TOOLS')
print_rainbow(tools, single_line=True)
# Re-raise the exception to inform the caller of the API failure
raise
# Try to extract backend information if available
try:
@ -747,6 +775,13 @@ class LLM:
if hasattr(message, "content_text"):
result: str = message.content_text
# Qwen3 and some other models include <think>...</think> blocks directly
# in the content field instead of (or in addition to) using reasoning_content.
# Strip those blocks so they never leak to the user.
if isinstance(result, str):
result = _strip_think_tags(result)
message.content = result
# Spara i meddelandehistorik (utan verktygsanrop för ren historik)
self.messages.append({"role": "assistant", "content": result})
if not self.chat:
@ -809,6 +844,11 @@ class LLM:
message = choice.message
result = message.content
# Strip Qwen3-style <think>...</think> blocks from content
if isinstance(result, str):
result = _strip_think_tags(result)
message.content = result
# Extract thinking from reasoning_content if present
thinking_content = None
if (
@ -1145,6 +1185,12 @@ if __name__ == "__main__":
explanation: str
llm = LLM()
print('LLM URL:', llm.host_url)
# base_url must be just the /v1 root — the OpenAI SDK appends the
# correct path itself (/v1/responses or /v1/chat/completions).
# Setting it to the full endpoint path causes the SDK to produce
# invalid URLs like /v1/chat/completions/responses.
llm.host_url = "http://192.168.1.12:8000/v1"
response = llm.generate(
query="""Create a simple math problem solution in JSON format with this structure:
{

Loading…
Cancel
Save