feat(llm): add VLLM_ENGINE env flag for provider-specific extra_body format

This commit is contained in:
Woody 2026-04-28 13:30:27 +08:00
parent aa5f716578
commit 711be3dfde
2 changed files with 10 additions and 10 deletions

View File

@@ -13,6 +13,7 @@ class Settings(BaseSettings):
llm_api_key: str = "" llm_api_key: str = ""
llm_model_name: str = "qwen/qwen3.5-35b-a3b" llm_model_name: str = "qwen/qwen3.5-35b-a3b"
llm_enable_thinking: bool = False llm_enable_thinking: bool = False
vllm_engine: bool = False
# Embeddings # Embeddings
embedding_model: str = "qwen/qwen3-embedding-4b" embedding_model: str = "qwen/qwen3-embedding-4b"

View File

@@ -83,20 +83,19 @@ class LLMClient:
def _build_extra_body(self) -> dict: def _build_extra_body(self) -> dict:
"""Build extra_body for provider-specific parameters. """Build extra_body for provider-specific parameters.
For Qwen3.5 models, disables thinking content via chat_template_kwargs When thinking is enabled, no extra params are passed
(vLLM/SGLang convention). When thinking is enabled, no extra params and the model uses its default thinking mode.
are passed and the model uses its default thinking mode.
vLLM: {"chat_template_kwargs": {"enable_thinking": False}}
OpenRouter: {"reasoning": {"enabled": False}}
""" """
if self.enable_thinking: if self.enable_thinking:
return {} return {}
# Non-thinking mode for Qwen3.5 if self.settings.vllm_engine:
# Uses chat_template_kwargs for vLLM/SGLang compatibility. return {
# For Alibaba Cloud Model Studio, use top-level enable_thinking instead. "chat_template_kwargs": {"enable_thinking": False},
# return { }
# "chat_template_kwargs": {"enable_thinking": False},
# "top_k": 20,
# }
return {"reasoning": {"enabled": False}} return {"reasoning": {"enabled": False}}
async def close(self): async def close(self):