71 lines
2.4 KiB
Python
71 lines
2.4 KiB
Python
import logging
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
from openai import AsyncOpenAI, APIError, APITimeoutError
|
|
|
|
from app.core.config import Settings
|
|
|
|
|
|
class LLMClientError(Exception):
    """Error raised by ``LLMClient`` when a completion request fails."""
|
|
|
|
|
|
class LLMClient:
    """Asynchronous LLM client using OpenAI SDK with provider-agnostic config.

    Wraps an ``AsyncOpenAI`` instance built from the application settings and
    exposes a single-prompt :meth:`complete` call. Failures are re-raised as
    ``LLMClientError``. The instance can also be used as an async context
    manager, in which case :meth:`close` is awaited on exit.
    """

    def __init__(self, settings: "Settings"):
        """Create the underlying SDK client from *settings*.

        Reads ``llm_model_name``, ``llm_enable_thinking``, ``llm_base_url``,
        ``llm_api_key`` and ``llm_timeout`` from *settings*.
        """
        self.settings = settings
        self.model = settings.llm_model_name
        self.enable_thinking = settings.llm_enable_thinking
        self.logger = logging.getLogger(__name__)
        self._client = AsyncOpenAI(
            # Trailing slashes are stripped from the configured base URL.
            base_url=settings.llm_base_url.rstrip("/"),
            api_key=settings.llm_api_key,
            timeout=settings.llm_timeout,
            http_client=httpx.AsyncClient(
                headers={"Content-Type": "application/json"},
            ),
        )

    async def __aenter__(self) -> "LLMClient":
        """Return the client itself so it can be used with ``async with``."""
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        """Close the underlying client when leaving an ``async with`` block."""
        await self.close()

    async def complete(self, prompt: str, temperature: float = 0.7) -> str:
        """Send a chat completion request with optional thinking control.

        Args:
            prompt: Text sent as the single ``user`` message.
            temperature: Sampling temperature forwarded to the API.

        Returns:
            The content of the first choice, or ``""`` when that content is
            empty or ``None``.

        Raises:
            LLMClientError: If the SDK raises, the response carries no
                choices, or any other exception occurs while the response is
                being read. When another exception triggered the failure it
                is chained as ``__cause__``.
        """
        messages = [{"role": "user", "content": prompt}]
        extra_body = self._build_extra_body()

        try:
            response = await self._client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=temperature,
                # An empty dict means "no provider extras"; send None instead.
                extra_body=extra_body or None,
            )
            choices = response.choices
            if not choices:
                # Report the real problem instead of an opaque IndexError.
                self.logger.error("LLM response contained no choices")
                raise LLMClientError("LLM response contained no choices")
            return choices[0].message.content or ""
        except LLMClientError:
            # Already in the caller-facing form; do not wrap it a second time.
            raise
        except (APITimeoutError, APIError) as exc:
            self.logger.error("LLM API error: %s", exc)
            # Carry the message so str(error) is useful to callers.
            raise LLMClientError(f"LLM API error: {exc}") from exc
        except Exception as exc:
            # logger.exception keeps the traceback for failures we did not
            # anticipate.
            self.logger.exception("Unexpected LLM error: %s", exc)
            raise LLMClientError(f"Unexpected LLM error: {exc}") from exc

    def _build_extra_body(self) -> dict:
        """Build extra_body for provider-specific parameters.

        For Qwen3.5 models, disables thinking content via chat_template_kwargs
        (vLLM/SGLang convention) and also sets ``top_k`` to 20. When thinking
        is enabled, no extra params are passed and the model uses its default
        thinking mode.

        The model name is not inspected here: the non-thinking payload is
        returned for any model whenever ``enable_thinking`` is falsy.

        Returns:
            An empty dict when thinking is enabled, otherwise the
            non-thinking parameters.
        """
        if self.enable_thinking:
            return {}

        # Non-thinking mode for Qwen3.5.
        # Uses chat_template_kwargs for vLLM/SGLang compatibility.
        # For Alibaba Cloud Model Studio, use top-level enable_thinking instead.
        return {
            "chat_template_kwargs": {"enable_thinking": False},
            "top_k": 20,
        }

    async def close(self) -> None:
        """Close the underlying ``AsyncOpenAI`` client."""
        await self._client.close()
|