From 711be3dfde2b40fcb4960b09b1a3b9cdc930055c Mon Sep 17 00:00:00 2001
From: Woody <woody.ck.tse@gmail.com>
Date: Tue, 28 Apr 2026 13:30:27 +0800
Subject: [PATCH] feat(llm): add VLLM_ENGINE env flag for provider-specific
 extra_body format

---
 backend/app/core/config.py         |  1 +
 backend/app/services/llm_client.py | 19 +++++++++----------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/backend/app/core/config.py b/backend/app/core/config.py
index 34e508c..6477f04 100644
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -13,6 +13,7 @@ class Settings(BaseSettings):
     llm_api_key: str = ""
     llm_model_name: str = "qwen/qwen3.5-35b-a3b"
     llm_enable_thinking: bool = False
+    vllm_engine: bool = False
 
     # Embeddings
     embedding_model: str = "qwen/qwen3-embedding-4b"
diff --git a/backend/app/services/llm_client.py b/backend/app/services/llm_client.py
index 6069367..0ea3263 100644
--- a/backend/app/services/llm_client.py
+++ b/backend/app/services/llm_client.py
@@ -83,20 +83,19 @@ class LLMClient:
     def _build_extra_body(self) -> dict:
         """Build extra_body for provider-specific parameters.
 
-        For Qwen3.5 models, disables thinking content via chat_template_kwargs
-        (vLLM/SGLang convention). When thinking is enabled, no extra params
-        are passed and the model uses its default thinking mode.
+        When thinking is enabled, returns {} so the model uses its default
+        thinking mode. Otherwise returns the payload that disables thinking:
+
+        vLLM:        {"chat_template_kwargs": {"enable_thinking": False}}
+        OpenRouter:  {"reasoning": {"enabled": False}}
         """
         if self.enable_thinking:
             return {}
 
-        # Non-thinking mode for Qwen3.5
-        # Uses chat_template_kwargs for vLLM/SGLang compatibility.
-        # For Alibaba Cloud Model Studio, use top-level enable_thinking instead.
-        # return {
-        #     "chat_template_kwargs": {"enable_thinking": False},
-        #     "top_k": 20,
-        # }
+        if self.settings.vllm_engine:
+            return {
+                "chat_template_kwargs": {"enable_thinking": False},
+            }
         return {"reasoning": {"enabled": False}}
 
     async def close(self):