# legco_ai_assistant/backend/app/services/llm_client.py

import logging
from typing import Optional

import httpx
from openai import AsyncOpenAI, APIError, APITimeoutError

from app.core.config import Settings


class LLMClientError(Exception):
    """Error raised by ``LLMClient`` when a completion request fails.

    ``LLMClient.complete`` raises this for SDK errors as well as for any
    other exception hit while performing the request; the triggering
    exception is chained as ``__cause__``.
    """


class LLMClient:
    """Asynchronous chat-completion client built on the OpenAI SDK.

    Connection details (base URL, API key, model name, timeout) are read from
    the supplied settings object, so any OpenAI-compatible endpoint can be
    targeted without code changes.

    The instance owns an HTTP client; release it with ``close()`` or by using
    the instance as an async context manager::

        async with LLMClient(settings) as llm:
            answer = await llm.complete("Hello")
    """

    def __init__(self, settings: Settings):
        """Create the underlying SDK client from *settings*.

        Reads ``llm_model_name``, ``llm_enable_thinking``, ``llm_base_url``,
        ``llm_api_key`` and ``llm_timeout`` from *settings*.
        """
        self.settings = settings
        self.model = settings.llm_model_name
        self.enable_thinking = settings.llm_enable_thinking
        self.logger = logging.getLogger(__name__)
        self._client = AsyncOpenAI(
            # Trailing slashes are stripped from the configured base URL.
            base_url=settings.llm_base_url.rstrip("/"),
            api_key=settings.llm_api_key,
            timeout=settings.llm_timeout,
            http_client=httpx.AsyncClient(
                headers={"Content-Type": "application/json"},
            ),
        )

    async def __aenter__(self) -> "LLMClient":
        """Return the client itself so it can be used with ``async with``."""
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        """Close the underlying client when the ``async with`` block exits."""
        await self.close()

    async def complete(self, prompt: str, temperature: float = 0.7) -> str:
        """Send *prompt* as a single user message and return the reply text.

        Args:
            prompt: Text sent as the content of the only ``user`` message.
            temperature: Sampling temperature forwarded to the API.

        Returns:
            The content of the first choice, or ``""`` when that content is
            empty or ``None``.

        Raises:
            LLMClientError: If the SDK reports an API/timeout error, or if any
                other exception occurs while sending the request or reading
                the response. The original exception is chained as
                ``__cause__`` and its text is included in the message.
        """
        messages = [{"role": "user", "content": prompt}]
        # An empty dict means "no provider-specific parameters"; pass None then.
        extra_body = self._build_extra_body() or None

        try:
            response = await self._client.chat.completions.create(
                model=self.model,
                messages=messages,
                temperature=temperature,
                extra_body=extra_body,
            )
            return response.choices[0].message.content or ""
        except (APITimeoutError, APIError) as exc:
            self.logger.error("LLM API error: %s", exc)
            # Carry the cause's text: a bare ``LLMClientError`` stringifies to
            # "" and leaves callers with nothing to report.
            raise LLMClientError(f"LLM API error: {exc}") from exc
        except Exception as exc:
            # Anything else (e.g. a response with no choices) is unexpected,
            # so record the traceback as well.
            self.logger.exception("Unexpected LLM error: %s", exc)
            raise LLMClientError(f"Unexpected LLM error: {exc}") from exc

    def _build_extra_body(self) -> dict:
        """Build the provider-specific ``extra_body`` for a completion request.

        When thinking is enabled, no extra parameters are sent and the model
        uses its default thinking mode. Otherwise thinking is disabled for
        Qwen3.5 models through ``chat_template_kwargs`` (the vLLM/SGLang
        convention).

        Returns:
            A new dict of extra request parameters; empty when thinking is
            enabled.
        """
        if self.enable_thinking:
            return {}

        # Non-thinking mode for Qwen3.5.
        # Uses chat_template_kwargs for vLLM/SGLang compatibility.
        # For Alibaba Cloud Model Studio, use top-level enable_thinking instead.
        return {
            "chat_template_kwargs": {"enable_thinking": False},
            # NOTE(review): top_k=20 presumably mirrors the model's recommended
            # sampling settings — confirm.
            "top_k": 20,
        }

    async def close(self) -> None:
        """Close the underlying SDK client."""
        await self._client.close()