From 711be3dfde2b40fcb4960b09b1a3b9cdc930055c Mon Sep 17 00:00:00 2001 From: Woody Date: Tue, 28 Apr 2026 13:30:27 +0800 Subject: [PATCH] feat(llm): add VLLM_ENGINE env flag for provider-specific extra_body format --- backend/app/core/config.py | 1 + backend/app/services/llm_client.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 34e508c..6477f04 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -13,6 +13,7 @@ class Settings(BaseSettings): llm_api_key: str = "" llm_model_name: str = "qwen/qwen3.5-35b-a3b" llm_enable_thinking: bool = False + vllm_engine: bool = False # Embeddings embedding_model: str = "qwen/qwen3-embedding-4b" diff --git a/backend/app/services/llm_client.py b/backend/app/services/llm_client.py index 6069367..0ea3263 100644 --- a/backend/app/services/llm_client.py +++ b/backend/app/services/llm_client.py @@ -83,20 +83,19 @@ class LLMClient: def _build_extra_body(self) -> dict: """Build extra_body for provider-specific parameters. - For Qwen3.5 models, disables thinking content via chat_template_kwargs - (vLLM/SGLang convention). When thinking is enabled, no extra params - are passed and the model uses its default thinking mode. + When thinking is enabled, no extra params are passed + and the model uses its default thinking mode. + + vLLM: {"chat_template_kwargs": {"enable_thinking": False}} + OpenRouter: {"reasoning": {"enabled": False}} """ if self.enable_thinking: return {} - # Non-thinking mode for Qwen3.5 - # Uses chat_template_kwargs for vLLM/SGLang compatibility. - # For Alibaba Cloud Model Studio, use top-level enable_thinking instead. - # return { - # "chat_template_kwargs": {"enable_thinking": False}, - # "top_k": 20, - # } + if self.settings.vllm_engine: + return { + "chat_template_kwargs": {"enable_thinking": False}, + } return {"reasoning": {"enabled": False}} async def close(self):