diff --git a/backend/app/core/config.py b/backend/app/core/config.py
index 34e508c..6477f04 100644
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -13,6 +13,7 @@ class Settings(BaseSettings):
     llm_api_key: str = ""
     llm_model_name: str = "qwen/qwen3.5-35b-a3b"
     llm_enable_thinking: bool = False
+    vllm_engine: bool = False
 
     # Embeddings
     embedding_model: str = "qwen/qwen3-embedding-4b"
diff --git a/backend/app/services/llm_client.py b/backend/app/services/llm_client.py
index 6069367..0ea3263 100644
--- a/backend/app/services/llm_client.py
+++ b/backend/app/services/llm_client.py
@@ -83,20 +83,19 @@ class LLMClient:
     def _build_extra_body(self) -> dict:
         """Build extra_body for provider-specific parameters.
 
-        For Qwen3.5 models, disables thinking content via chat_template_kwargs
-        (vLLM/SGLang convention). When thinking is enabled, no extra params
-        are passed and the model uses its default thinking mode.
+        When thinking is enabled, no extra params are passed
+        and the model uses its default thinking mode.
+
+        vLLM: {"chat_template_kwargs": {"enable_thinking": False}}
+        OpenRouter: {"reasoning": {"enabled": False}}
         """
         if self.enable_thinking:
             return {}
 
-        # Non-thinking mode for Qwen3.5
-        # Uses chat_template_kwargs for vLLM/SGLang compatibility.
-        # For Alibaba Cloud Model Studio, use top-level enable_thinking instead.
-        # return {
-        #     "chat_template_kwargs": {"enable_thinking": False},
-        #     "top_k": 20,
-        # }
+        if self.settings.vllm_engine:
+            return {
+                "chat_template_kwargs": {"enable_thinking": False},
+            }
         return {"reasoning": {"enabled": False}}
 
     async def close(self):