|
|
|
|
@@ -50,11 +50,33 @@ except ImportError: |
|
|
|
|
env_manager.set_env() |
|
|
|
|
|
|
|
|
|
tokenizer = tiktoken.get_encoding("cl100k_base") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _strip_think_tags(text: str) -> str: |
|
|
|
|
""" |
|
|
|
|
Remove <think>...</think> blocks from the text. |
|
|
|
|
|
|
|
|
|
Qwen3 (and some other models) embed their chain-of-thought reasoning |
|
|
|
|
directly in the content field wrapped in <think> tags when the |
|
|
|
|
reasoning_content API field is not used. This helper strips those |
|
|
|
|
blocks so they are never shown to end-users. |
|
|
|
|
|
|
|
|
|
Args: |
|
|
|
|
text: The raw LLM output string, possibly containing <think> blocks. |
|
|
|
|
|
|
|
|
|
Returns: |
|
|
|
|
The text with all <think>...</think> sections removed and |
|
|
|
|
surrounding whitespace trimmed. |
|
|
|
|
""" |
|
|
|
|
# re.DOTALL so that '.' matches newlines inside the thinking block |
|
|
|
|
cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL) |
|
|
|
|
return cleaned.strip() |
|
|
|
|
# Fallback context window (in tokens) for models with no explicit entry below.
DEFAULT_CONTEXT_WINDOW = int(os.getenv("LLM_MAX_CONTEXT_TOKENS", "32000"))

# Per-model context-window overrides, each tunable via an environment variable.
MODEL_CONTEXT_WINDOWS = {
    # NOTE(review): the source had two adjacent, conflicting getenv() calls
    # here (defaults "8192" and "32768") — a stripped old/new diff pair that
    # is a syntax error as written.  The later "32768" default is kept;
    # confirm against the deployment configuration.
    "/mnt/model_drive/models/gpt-oss-20b": int(
        os.getenv("LLM_CONTEXT_GPT_OSS_20B", "32768")
    ),
    "qwen3-14b": int(os.getenv("LLM_CONTEXT_QWEN_14B", "32768")),
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -98,7 +120,7 @@ class LLM: |
|
|
|
|
timeout: int = 240, |
|
|
|
|
on_vpn: bool = False, |
|
|
|
|
silent: bool = False, |
|
|
|
|
presence_penalty: float = 0.6, |
|
|
|
|
presence_penalty: float = 0.3, |
|
|
|
|
top_p: float = 0.9, |
|
|
|
|
extra_body: Optional[Dict[str, Any]] = None, |
|
|
|
|
) -> None: |
|
|
|
|
@@ -129,7 +151,9 @@ class LLM: |
|
|
|
|
"presence_penalty": presence_penalty, |
|
|
|
|
"top_p": top_p, |
|
|
|
|
} |
|
|
|
|
self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.15} |
|
|
|
|
# repetition_penalty > 1.0 scales down logits for already-seen tokens. |
|
|
|
|
# 1.2 is enough to break generation loops without hurting quality. |
|
|
|
|
self.extra_body = extra_body if extra_body is not None else {"repetition_penalty": 1.2} |
|
|
|
|
self.messages = messages or [{"role": "system", "content": self.system_message}] |
|
|
|
|
self.max_length_answer = max_length_answer |
|
|
|
|
self.chat = chat |
|
|
|
|
@@ -177,7 +201,7 @@ class LLM: |
|
|
|
|
sensible defaults while still accepting fully qualified model names. |
|
|
|
|
""" |
|
|
|
|
default_model = os.getenv( |
|
|
|
|
"LLM_MODEL_VLLM", "/mnt/model_drive/models/gpt-oss-20b" |
|
|
|
|
"LLM_MODEL_VLLM", "qwen3-14b" |
|
|
|
|
) |
|
|
|
|
if model_alias in {None, "", "vllm"}: |
|
|
|
|
return default_model |
|
|
|
|
@@ -479,7 +503,9 @@ class LLM: |
|
|
|
|
response: ChatCompletion = self.client.chat.completions.create( |
|
|
|
|
model=model, |
|
|
|
|
messages=self.messages, |
|
|
|
|
frequency_penalty=0.4, |
|
|
|
|
# frequency_penalty removed: repetition_penalty in extra_body |
|
|
|
|
# already handles this. Having both causes compounding effects |
|
|
|
|
# that push the model into token loops. |
|
|
|
|
stream=stream, |
|
|
|
|
temperature=options["temperature"], |
|
|
|
|
presence_penalty=options["presence_penalty"], |
|
|
|
|
@@ -497,6 +523,8 @@ class LLM: |
|
|
|
|
print() |
|
|
|
|
print('TOOLS') |
|
|
|
|
print_rainbow(tools, single_line=True) |
|
|
|
|
# Re-raise the exception to inform the caller of the API failure |
|
|
|
|
raise |
|
|
|
|
|
|
|
|
|
# Try to extract backend information if available |
|
|
|
|
try: |
|
|
|
|
@@ -747,6 +775,13 @@ class LLM: |
|
|
|
|
if hasattr(message, "content_text"): |
|
|
|
|
result: str = message.content_text |
|
|
|
|
|
|
|
|
|
# Qwen3 and some other models include <think>...</think> blocks directly |
|
|
|
|
# in the content field instead of (or in addition to) using reasoning_content. |
|
|
|
|
# Strip those blocks so they never leak to the user. |
|
|
|
|
if isinstance(result, str): |
|
|
|
|
result = _strip_think_tags(result) |
|
|
|
|
message.content = result |
|
|
|
|
|
|
|
|
|
# Spara i meddelandehistorik (utan verktygsanrop för ren historik) |
|
|
|
|
self.messages.append({"role": "assistant", "content": result}) |
|
|
|
|
if not self.chat: |
|
|
|
|
@@ -809,6 +844,11 @@ class LLM: |
|
|
|
|
message = choice.message |
|
|
|
|
result = message.content |
|
|
|
|
|
|
|
|
|
# Strip Qwen3-style <think>...</think> blocks from content |
|
|
|
|
if isinstance(result, str): |
|
|
|
|
result = _strip_think_tags(result) |
|
|
|
|
message.content = result |
|
|
|
|
|
|
|
|
|
# Extract thinking from reasoning_content if present |
|
|
|
|
thinking_content = None |
|
|
|
|
if ( |
|
|
|
|
@@ -1145,6 +1185,12 @@ if __name__ == "__main__": |
|
|
|
|
explanation: str |
|
|
|
|
|
|
|
|
|
llm = LLM() |
|
|
|
|
print('LLM URL:', llm.host_url) |
|
|
|
|
# base_url must be just the /v1 root — the OpenAI SDK appends the |
|
|
|
|
# correct path itself (/v1/responses or /v1/chat/completions). |
|
|
|
|
# Setting it to the full endpoint path causes the SDK to produce |
|
|
|
|
# invalid URLs like /v1/chat/completions/responses. |
|
|
|
|
llm.host_url = "http://192.168.1.12:8000/v1" |
|
|
|
|
response = llm.generate( |
|
|
|
|
query="""Create a simple math problem solution in JSON format with this structure: |
|
|
|
|
{ |
|
|
|
|
|