feat(backend): migrate LLM client to OpenAI SDK with thinking control
- Replace httpx with openai.AsyncOpenAI
- Add llm_enable_thinking config (default False)
- Add _build_extra_body() for Qwen3.5 thinking mode control
- Use chat_template_kwargs for vLLM/SGLang compatibility

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
2f896052a1
commit
74cb8b83d5
|
|
@ -9,6 +9,7 @@ class Settings(BaseSettings):
|
||||||
llm_base_url: str = "https://openrouter.ai/api/v1"
|
llm_base_url: str = "https://openrouter.ai/api/v1"
|
||||||
llm_api_key: str = ""
|
llm_api_key: str = ""
|
||||||
llm_model_name: str = "qwen/qwen3.5-35b-a3b"
|
llm_model_name: str = "qwen/qwen3.5-35b-a3b"
|
||||||
|
llm_enable_thinking: bool = False
|
||||||
|
|
||||||
# Embeddings
|
# Embeddings
|
||||||
embedding_model: str = "qwen/qwen3-embedding-4b"
|
embedding_model: str = "qwen/qwen3-embedding-4b"
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
import asyncio
|
|
||||||
import logging
|
import logging
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
from openai import AsyncOpenAI, APIError, APITimeoutError
|
||||||
|
|
||||||
from app.core.config import Settings
|
from app.core.config import Settings
|
||||||
|
|
||||||
|
|
@ -12,38 +12,59 @@ class LLMClientError(Exception):
|
||||||
|
|
||||||
|
|
||||||
class LLMClient:
|
class LLMClient:
|
||||||
"""Asynchronous LLM HTTP client with connection pooling."""
|
"""Asynchronous LLM client using OpenAI SDK with provider-agnostic config."""
|
||||||
|
|
||||||
def __init__(self, settings: Settings):
|
def __init__(self, settings: Settings):
|
||||||
self.settings = settings
|
self.settings = settings
|
||||||
self.base_url = settings.llm_base_url.rstrip("/")
|
|
||||||
self.api_key = settings.llm_api_key
|
|
||||||
self.model = settings.llm_model_name
|
self.model = settings.llm_model_name
|
||||||
|
self.enable_thinking = settings.llm_enable_thinking
|
||||||
self.logger = logging.getLogger(__name__)
|
self.logger = logging.getLogger(__name__)
|
||||||
# Use a single shared AsyncClient for all requests
|
self._client = AsyncOpenAI(
|
||||||
self._client = httpx.AsyncClient(
|
base_url=settings.llm_base_url.rstrip("/"),
|
||||||
base_url=self.base_url,
|
api_key=settings.llm_api_key,
|
||||||
timeout=settings.llm_timeout,
|
timeout=settings.llm_timeout,
|
||||||
headers={"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"},
|
http_client=httpx.AsyncClient(
|
||||||
|
headers={"Content-Type": "application/json"},
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
async def complete(self, prompt: str, temperature: float = 0.7) -> str:
|
async def complete(self, prompt: str, temperature: float = 0.7) -> str:
|
||||||
|
"""Send a chat completion request with optional thinking control."""
|
||||||
|
messages = [{"role": "user", "content": prompt}]
|
||||||
|
extra_body = self._build_extra_body()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
resp = await self._client.post(
|
response = await self._client.chat.completions.create(
|
||||||
"/chat/completions",
|
model=self.model,
|
||||||
json={
|
messages=messages,
|
||||||
"model": self.model,
|
temperature=temperature,
|
||||||
"messages": [{"role": "user", "content": prompt}],
|
extra_body=extra_body if extra_body else None,
|
||||||
"temperature": temperature,
|
|
||||||
},
|
|
||||||
)
|
)
|
||||||
resp.raise_for_status()
|
return response.choices[0].message.content or ""
|
||||||
data = resp.json()
|
except (APITimeoutError, APIError) as exc:
|
||||||
return data["choices"][0]["message"]["content"]
|
|
||||||
except (httpx.TimeoutException, httpx.HTTPStatusError, httpx.RequestError) as exc:
|
|
||||||
self.logger.error("LLM API error: %s", exc)
|
self.logger.error("LLM API error: %s", exc)
|
||||||
raise LLMClientError from exc
|
raise LLMClientError from exc
|
||||||
|
except Exception as exc:
|
||||||
|
self.logger.error("Unexpected LLM error: %s", exc)
|
||||||
|
raise LLMClientError from exc
|
||||||
|
|
||||||
|
def _build_extra_body(self) -> dict:
|
||||||
|
"""Build extra_body for provider-specific parameters.
|
||||||
|
|
||||||
|
For Qwen3.5 models, disables thinking content via chat_template_kwargs
|
||||||
|
(vLLM/SGLang convention). When thinking is enabled, no extra params
|
||||||
|
are passed and the model uses its default thinking mode.
|
||||||
|
"""
|
||||||
|
if self.enable_thinking:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
# Non-thinking mode for Qwen3.5
|
||||||
|
# Uses chat_template_kwargs for vLLM/SGLang compatibility.
|
||||||
|
# For Alibaba Cloud Model Studio, use top-level enable_thinking instead.
|
||||||
|
return {
|
||||||
|
"chat_template_kwargs": {"enable_thinking": False},
|
||||||
|
"top_k": 20,
|
||||||
|
}
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
if self._client:
|
await self._client.close()
|
||||||
await self._client.aclose()
|
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ python-docx==1.1.0
|
||||||
pypdf==4.0.2
|
pypdf==4.0.2
|
||||||
python-dotenv==1.0.0
|
python-dotenv==1.0.0
|
||||||
httpx==0.26.0
|
httpx==0.26.0
|
||||||
|
openai==1.12.0
|
||||||
pytest==7.4.4
|
pytest==7.4.4
|
||||||
pytest-asyncio==0.23.4
|
pytest-asyncio==0.23.4
|
||||||
tiktoken==0.5.2
|
tiktoken==0.5.2
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue