@@ -5,6 +5,12 @@ import queue
 import threading
 from pprint import pprint
 import re
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+
+
 
 class LLM:
     def __init__(
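
Note: `load_dotenv()` reads key/value pairs from a `.env` file into the process
environment, which the `os.getenv` calls added below rely on. A minimal `.env`
that preserves the removed hardcoded values might look like this (the file name
and lookup location are python-dotenv's defaults; the values are copied from the
lines this diff deletes):

    LLM_URL=192.168.1.12
    LLM_PORT=3300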
@@ -28,8 +34,8 @@ class LLM:
             which processes requests concurrently. Defaults to False.
         """
-        self.server = "192.168.1.12"
-        self.port = 3300  # 11440 All 4 GPU # 4500 "SW" 3300 balancer
+        self.server = os.getenv("LLM_URL")
+        self.port = os.getenv("LLM_PORT")
         self.model = model
         self.temperature = temperature
         self.system_message = {"role": "system", "content": system_prompt}
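
Note: `os.getenv` returns a string, or `None` when the variable is unset, so
`self.port` changes type here from int (`3300`) to str. If downstream code does
arithmetic or socket calls with the port, a stricter read may be wanted; a
hedged sketch, reusing the removed values as fallbacks (the fallback behavior is
an assumption, not part of this change):

    self.server = os.getenv("LLM_URL", "192.168.1.12")
    self.port = int(os.getenv("LLM_PORT", "3300"))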