class LLMClient:
    """Asynchronous chat-completion client built on the OpenAI SDK.

    Endpoint, credentials, model and timeout are read from ``Settings``
    (``llm_base_url``, ``llm_api_key``, ``llm_model_name``, ``llm_timeout``).
    The boolean ``llm_enable_thinking`` setting decides whether the
    provider-specific "disable thinking" parameters are attached to each
    request (see ``_build_extra_body``).

    Any exception raised while completing a prompt is logged and re-raised
    as ``LLMClientError`` with the original exception set as its cause.
    """

    def __init__(self, settings: "Settings"):
        """Create the SDK client from application settings.

        Args:
            settings: Application settings providing the ``llm_*`` fields
                listed in the class docstring.
        """
        self.settings = settings
        self.model = settings.llm_model_name
        self.enable_thinking = settings.llm_enable_thinking
        self.logger = logging.getLogger(__name__)

        # One SDK client (and one underlying httpx client) is created per
        # LLMClient instance and reused by every call to ``complete``.
        self._client = AsyncOpenAI(
            base_url=settings.llm_base_url.rstrip("/"),
            api_key=settings.llm_api_key,
            timeout=settings.llm_timeout,
            http_client=httpx.AsyncClient(
                headers={"Content-Type": "application/json"},
            ),
        )

    async def complete(self, prompt: str, temperature: float = 0.7) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: Text sent as the only message, with role ``"user"``.
            temperature: Sampling temperature forwarded to the API.

        Returns:
            The content of the first choice, or ``""`` when that content is
            ``None`` or empty.

        Raises:
            LLMClientError: On any failure. The message repeats what was
                logged and the original exception is available as
                ``__cause__``.
        """
        messages = [{"role": "user", "content": prompt}]
        # An empty dict means "no provider-specific parameters"; collapse it
        # to None so nothing extra is handed to the SDK.
        extra_body = self._build_extra_body() or None

        try:
            response = await self._client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=temperature,
                extra_body=extra_body,
            )
            return response.choices[0].message.content or ""
        except (APITimeoutError, APIError) as exc:
            self.logger.error("LLM API error: %s", exc)
            # Carry the message on the wrapper too, so callers that only
            # format the LLMClientError still see what went wrong.
            raise LLMClientError(f"LLM API error: {exc}") from exc
        except Exception as exc:
            # Boundary handler: anything else (e.g. a response with no
            # choices) is still surfaced as LLMClientError.
            self.logger.error("Unexpected LLM error: %s", exc)
            raise LLMClientError(f"Unexpected LLM error: {exc}") from exc

    def _build_extra_body(self) -> dict:
        """Build the ``extra_body`` payload of provider-specific parameters.

        Returns:
            An empty dict when thinking is enabled, so the model keeps its
            default thinking mode. Otherwise a new dict that disables
            thinking through ``chat_template_kwargs`` (the vLLM/SGLang
            convention for Qwen3.5 models) and sets ``top_k``.
        """
        if self.enable_thinking:
            return {}

        # Non-thinking mode for Qwen3.5.
        # Uses chat_template_kwargs for vLLM/SGLang compatibility.
        # For Alibaba Cloud Model Studio, use top-level enable_thinking instead.
        # NOTE(review): top_k=20 is only sent in non-thinking mode - confirm
        # that coupling is intended.
        return {
            "chat_template_kwargs": {"enable_thinking": False},
            "top_k": 20,
        }

    async def close(self):
        """Close the underlying SDK client."""
        await self._client.close()