feat(backend): migrate LLM client to OpenAI SDK with thinking control

- Replace direct httpx requests with openai.AsyncOpenAI (httpx remains as the SDK's underlying HTTP client)
- Add llm_enable_thinking config (default False)
- Add _build_extra_body() for Qwen3.5 thinking mode control
- Use chat_template_kwargs for vLLM/SGLang compatibility

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-23 14:10:26 +08:00 · 2026-04-23 14:10:26 +08:00 · 74cb8b83d5
parent 2f896052a1
commit 74cb8b83d5
3 changed files with 44 additions and 21 deletions
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -9,6 +9,7 @@ class Settings(BaseSettings):
    llm_base_url: str = "https://openrouter.ai/api/v1"
    llm_api_key: str = ""
    llm_model_name: str = "qwen/qwen3.5-35b-a3b"
+    llm_enable_thinking: bool = False

    # Embeddings
    embedding_model: str = "qwen/qwen3-embedding-4b"
--- a/backend/app/services/llm_client.py
+++ b/backend/app/services/llm_client.py
@@ -1,8 +1,8 @@
-import asyncio
 import logging
 from typing import Optional

 import httpx
+from openai import AsyncOpenAI, APIError, APITimeoutError

 from app.core.config import Settings

@@ -12,38 +12,59 @@ class LLMClientError(Exception):


 class LLMClient:
-    """Asynchronous LLM HTTP client with connection pooling."""
+    """Asynchronous LLM client using OpenAI SDK with provider-agnostic config."""

    def __init__(self, settings: Settings):
        self.settings = settings
-        self.base_url = settings.llm_base_url.rstrip("/")
-        self.api_key = settings.llm_api_key
        self.model = settings.llm_model_name
+        self.enable_thinking = settings.llm_enable_thinking
        self.logger = logging.getLogger(__name__)
-        # Use a single shared AsyncClient for all requests
-        self._client = httpx.AsyncClient(
-            base_url=self.base_url,
+        self._client = AsyncOpenAI(
+            base_url=settings.llm_base_url.rstrip("/"),
+            api_key=settings.llm_api_key,
            timeout=settings.llm_timeout,
-            headers={"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"},
+            http_client=httpx.AsyncClient(
+                headers={"Content-Type": "application/json"},
+            ),
        )

    async def complete(self, prompt: str, temperature: float = 0.7) -> str:
+        """Send a chat completion request with optional thinking control."""
+        messages = [{"role": "user", "content": prompt}]
+        extra_body = self._build_extra_body()
+
        try:
-            resp = await self._client.post(
-                "/chat/completions",
-                json={
-                    "model": self.model,
-                    "messages": [{"role": "user", "content": prompt}],
-                    "temperature": temperature,
-                },
+            response = await self._client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                temperature=temperature,
+                extra_body=extra_body if extra_body else None,
            )
-            resp.raise_for_status()
-            data = resp.json()
-            return data["choices"][0]["message"]["content"]
-        except (httpx.TimeoutException, httpx.HTTPStatusError, httpx.RequestError) as exc:
+            return response.choices[0].message.content or ""
+        except (APITimeoutError, APIError) as exc:
            self.logger.error("LLM API error: %s", exc)
            raise LLMClientError from exc
+        except Exception as exc:
+            self.logger.error("Unexpected LLM error: %s", exc)
+            raise LLMClientError from exc
+
+    def _build_extra_body(self) -> dict:
+        """Build extra_body for provider-specific parameters.
+
+        For Qwen3.5 models, disables thinking content via chat_template_kwargs
+        (vLLM/SGLang convention). When thinking is enabled, no extra params
+        are passed and the model uses its default thinking mode.
+        """
+        if self.enable_thinking:
+            return {}
+
+        # Non-thinking mode for Qwen3.5
+        # Uses chat_template_kwargs for vLLM/SGLang compatibility.
+        # For Alibaba Cloud Model Studio, use top-level enable_thinking instead.
+        return {
+            "chat_template_kwargs": {"enable_thinking": False},
+            "top_k": 20,
+        }

    async def close(self):
-        if self._client:
-            await self._client.aclose()
+        await self._client.close()
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -7,6 +7,7 @@ python-docx==1.1.0
 pypdf==4.0.2
 python-dotenv==1.0.0
 httpx==0.26.0
+openai==1.12.0
 pytest==7.4.4
 pytest-asyncio==0.23.4
 tiktoken==0.5.2