feat(backend): migrate LLM client to OpenAI SDK with thinking control

- Replace direct httpx request calls with openai.AsyncOpenAI (httpx is still passed in as the SDK's underlying http_client)

- Add llm_enable_thinking config (default False)

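- Add _build_extra_body() for Qwen3.5 thinking mode control (when llm_enable_thinking is False it sends enable_thinking=False and top_k=20; when True it sends no extra parameters)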

- Use chat_template_kwargs for vLLM/SGLang compatibility

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-23 14:10:26 +08:00
parent 2f896052a1
commit 74cb8b83d5
3 changed files with 44 additions and 21 deletions

View File

@ -9,6 +9,7 @@ class Settings(BaseSettings):
llm_base_url: str = "https://openrouter.ai/api/v1" llm_base_url: str = "https://openrouter.ai/api/v1"
llm_api_key: str = "" llm_api_key: str = ""
llm_model_name: str = "qwen/qwen3.5-35b-a3b" llm_model_name: str = "qwen/qwen3.5-35b-a3b"
llm_enable_thinking: bool = False
# Embeddings # Embeddings
embedding_model: str = "qwen/qwen3-embedding-4b" embedding_model: str = "qwen/qwen3-embedding-4b"

View File

@ -1,8 +1,8 @@
import asyncio
import logging import logging
from typing import Optional from typing import Optional
import httpx import httpx
from openai import AsyncOpenAI, APIError, APITimeoutError
from app.core.config import Settings from app.core.config import Settings
@ -12,38 +12,59 @@ class LLMClientError(Exception):
class LLMClient: class LLMClient:
"""Asynchronous LLM HTTP client with connection pooling.""" """Asynchronous LLM client using OpenAI SDK with provider-agnostic config."""
def __init__(self, settings: Settings): def __init__(self, settings: Settings):
self.settings = settings self.settings = settings
self.base_url = settings.llm_base_url.rstrip("/")
self.api_key = settings.llm_api_key
self.model = settings.llm_model_name self.model = settings.llm_model_name
self.enable_thinking = settings.llm_enable_thinking
self.logger = logging.getLogger(__name__) self.logger = logging.getLogger(__name__)
# Use a single shared AsyncClient for all requests self._client = AsyncOpenAI(
self._client = httpx.AsyncClient( base_url=settings.llm_base_url.rstrip("/"),
base_url=self.base_url, api_key=settings.llm_api_key,
timeout=settings.llm_timeout, timeout=settings.llm_timeout,
headers={"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}, http_client=httpx.AsyncClient(
headers={"Content-Type": "application/json"},
),
) )
async def complete(self, prompt: str, temperature: float = 0.7) -> str: async def complete(self, prompt: str, temperature: float = 0.7) -> str:
"""Send a chat completion request with optional thinking control."""
messages = [{"role": "user", "content": prompt}]
extra_body = self._build_extra_body()
try: try:
resp = await self._client.post( response = await self._client.chat.completions.create(
"/chat/completions", model=self.model,
json={ messages=messages,
"model": self.model, temperature=temperature,
"messages": [{"role": "user", "content": prompt}], extra_body=extra_body if extra_body else None,
"temperature": temperature,
},
) )
resp.raise_for_status() return response.choices[0].message.content or ""
data = resp.json() except (APITimeoutError, APIError) as exc:
return data["choices"][0]["message"]["content"]
except (httpx.TimeoutException, httpx.HTTPStatusError, httpx.RequestError) as exc:
self.logger.error("LLM API error: %s", exc) self.logger.error("LLM API error: %s", exc)
raise LLMClientError from exc raise LLMClientError from exc
except Exception as exc:
self.logger.error("Unexpected LLM error: %s", exc)
raise LLMClientError from exc
def _build_extra_body(self) -> dict:
"""Build extra_body for provider-specific parameters.
For Qwen3.5 models, disables thinking content via chat_template_kwargs
(vLLM/SGLang convention). When thinking is enabled, no extra params
are passed and the model uses its default thinking mode.
"""
if self.enable_thinking:
return {}
# Non-thinking mode for Qwen3.5
# Uses chat_template_kwargs for vLLM/SGLang compatibility.
# For Alibaba Cloud Model Studio, use top-level enable_thinking instead.
return {
"chat_template_kwargs": {"enable_thinking": False},
"top_k": 20,
}
async def close(self): async def close(self):
if self._client: await self._client.close()
await self._client.aclose()

View File

@ -7,6 +7,7 @@ python-docx==1.1.0
pypdf==4.0.2 pypdf==4.0.2
python-dotenv==1.0.0 python-dotenv==1.0.0
httpx==0.26.0 httpx==0.26.0
openai==1.12.0
pytest==7.4.4 pytest==7.4.4
pytest-asyncio==0.23.4 pytest-asyncio==0.23.4
tiktoken==0.5.2 tiktoken==0.5.2