import logging
from typing import Optional

import httpx
from openai import AsyncOpenAI, APIError, APITimeoutError

from app.core.config import Settings


class LLMClientError(Exception):
    """Raised when a completion request made through ``LLMClient`` fails."""


class LLMClient:
    """Asynchronous LLM client using OpenAI SDK with provider-agnostic config.

    Connection details (base URL, API key, timeout), the model name and the
    thinking toggle are all read from the ``Settings`` object passed to the
    constructor.

    The client can be used as an async context manager so the underlying
    SDK client is closed on exit::

        async with LLMClient(settings) as client:
            text = await client.complete("Hello")
    """

    def __init__(self, settings: Settings):
        """Build the SDK client from ``settings``.

        Args:
            settings: Source of ``llm_model_name``, ``llm_enable_thinking``,
                ``llm_base_url``, ``llm_api_key`` and ``llm_timeout``.
        """
        self.settings = settings
        self.model = settings.llm_model_name
        self.enable_thinking = settings.llm_enable_thinking
        self.logger = logging.getLogger(__name__)
        self._client = AsyncOpenAI(
            # Trailing slashes are stripped from the configured base URL.
            base_url=settings.llm_base_url.rstrip("/"),
            api_key=settings.llm_api_key,
            timeout=settings.llm_timeout,
            http_client=httpx.AsyncClient(
                headers={"Content-Type": "application/json"},
            ),
        )

    async def __aenter__(self) -> "LLMClient":
        """Return the client itself for use in ``async with`` blocks."""
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        """Close the client when leaving an ``async with`` block."""
        await self.close()

    async def complete(self, prompt: str, temperature: float = 0.7) -> str:
        """Send a chat completion request with optional thinking control.

        The prompt is sent as a single ``user`` message.

        Args:
            prompt: Text of the user message.
            temperature: Sampling temperature forwarded to the API.

        Returns:
            The content of the first choice's message, or ``""`` when that
            content is empty or ``None``.

        Raises:
            LLMClientError: If the request or the handling of its response
                fails for any reason. The original exception is attached as
                ``__cause__`` and summarised in the error message.
        """
        messages = [{"role": "user", "content": prompt}]
        extra_body = self._build_extra_body()

        try:
            response = await self._client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=temperature,
                # An empty dict means "no provider-specific params": send None.
                extra_body=extra_body or None,
            )
            return response.choices[0].message.content or ""
        except (APITimeoutError, APIError) as exc:
            self.logger.error("LLM API error: %s", exc)
            # Carry the cause in the message so str(error) is never empty.
            raise LLMClientError(f"LLM API error: {exc}") from exc
        except Exception as exc:
            # Boundary catch-all: anything else (e.g. a malformed response)
            # is still surfaced to callers as LLMClientError.
            self.logger.error("Unexpected LLM error: %s", exc)
            raise LLMClientError(f"Unexpected LLM error: {exc}") from exc

    def _build_extra_body(self) -> dict:
        """Build extra_body for provider-specific parameters.

        For Qwen3.5 models, disables thinking content via
        chat_template_kwargs (vLLM/SGLang convention). When thinking is
        enabled, no extra params are passed and the model uses its default
        thinking mode.

        Returns:
            An empty dict when thinking is enabled; otherwise a dict with
            ``chat_template_kwargs`` and ``top_k`` entries.
        """
        if self.enable_thinking:
            return {}

        # Non-thinking mode for Qwen3.5.
        # Uses chat_template_kwargs for vLLM/SGLang compatibility.
        # For Alibaba Cloud Model Studio, use top-level enable_thinking instead.
        # NOTE(review): top_k is only sent when thinking is disabled --
        # confirm this asymmetry is intended.
        return {
            "chat_template_kwargs": {"enable_thinking": False},
            "top_k": 20,
        }

    async def close(self) -> None:
        """Close the underlying ``AsyncOpenAI`` client."""
        await self._client.close()