feat(llm): add VLLM_ENGINE env flag for provider-specific extra_body format
This commit is contained in:
parent
aa5f716578
commit
711be3dfde
|
|
@ -13,6 +13,7 @@ class Settings(BaseSettings):
|
|||
llm_api_key: str = ""
|
||||
llm_model_name: str = "qwen/qwen3.5-35b-a3b"
|
||||
llm_enable_thinking: bool = False
|
||||
vllm_engine: bool = False
|
||||
|
||||
# Embeddings
|
||||
embedding_model: str = "qwen/qwen3-embedding-4b"
|
||||
|
|
|
|||
|
|
@ -83,20 +83,19 @@ class LLMClient:
|
|||
def _build_extra_body(self) -> dict:
|
||||
"""Build extra_body for provider-specific parameters.
|
||||
|
||||
For Qwen3.5 models, disables thinking content via chat_template_kwargs
|
||||
(vLLM/SGLang convention). When thinking is enabled, no extra params
|
||||
are passed and the model uses its default thinking mode.
|
||||
When thinking is enabled, no extra params are passed
|
||||
and the model uses its default thinking mode.
|
||||
|
||||
vLLM: {"chat_template_kwargs": {"enable_thinking": False}}
|
||||
OpenRouter: {"reasoning": {"enabled": False}}
|
||||
"""
|
||||
if self.enable_thinking:
|
||||
return {}
|
||||
|
||||
# Non-thinking mode for Qwen3.5
|
||||
# Uses chat_template_kwargs for vLLM/SGLang compatibility.
|
||||
# For Alibaba Cloud Model Studio, use top-level enable_thinking instead.
|
||||
# return {
|
||||
# "chat_template_kwargs": {"enable_thinking": False},
|
||||
# "top_k": 20,
|
||||
# }
|
||||
if self.settings.vllm_engine:
|
||||
return {
|
||||
"chat_template_kwargs": {"enable_thinking": False},
|
||||
}
|
||||
return {"reasoning": {"enabled": False}}
|
||||
|
||||
async def close(self):
|
||||
|
|
|
|||
Loading…
Reference in New Issue