fix: use vLLM-native guided_json for structured output

vLLM servers support JSON schema enforcement via extra_body (guided_json
or structured_outputs), not OpenAI's response_format protocol. LangChain's
with_structured_output(method='json_schema') sends response_format, which
vLLM ignores, causing "'NoneType' object is not iterable" parsing errors.

- vLLM path: direct OpenAI SDK call with extra_body={guided_json|structured_outputs}
- OpenRouter path: unchanged with_structured_output(method='json_schema')
- Try new 'structured_outputs' format first, fall back to legacy 'guided_json'
- Update _SEED_DECOMPOSE with explicit JSON array instruction
- Add diagnostic logging: exc_info=True, schema preview, prompt template preview
- Add logging in _parse_legacy_json for fallback failure debugging
This commit is contained in:
Woody 2026-04-29 16:49:14 +08:00
parent 2aca18d30e
commit 3ab6fd102a
3 changed files with 65 additions and 13 deletions

View File

@ -13,7 +13,8 @@ _SEED_DECOMPOSE = (
"Given this question: '{question}'\n\n" "Given this question: '{question}'\n\n"
"Break it down into 2-5 simplified sub-questions that would help " "Break it down into 2-5 simplified sub-questions that would help "
"search for relevant information. Each sub-question should be short " "search for relevant information. Each sub-question should be short "
"and focused on one aspect." "and focused on one aspect.\n\n"
'Return a JSON array of strings: ["sub-question 1", "sub-question 2", ...]'
) )
_SEED_FILTER = ( _SEED_FILTER = (

View File

@ -1,7 +1,8 @@
import json
import logging import logging
import os import os
import time import time
from typing import Optional from typing import Any, Optional
import httpx import httpx
from openai import AsyncOpenAI, APIError, APITimeoutError from openai import AsyncOpenAI, APIError, APITimeoutError
@ -131,17 +132,60 @@ class LLMClient:
self.logger.info("[%s] Structured LLM request started. Prompt: %s", step_name, prompt_preview) self.logger.info("[%s] Structured LLM request started. Prompt: %s", step_name, prompt_preview)
start_time = time.perf_counter() start_time = time.perf_counter()
extra_body = self._build_extra_body() if self.settings.vllm_engine:
self.logger.info("[%s] Structured LLM Extra Body: %s", step_name, str(extra_body)) return await self._complete_structured_vllm(prompt, pydantic_model, step_name, start_time)
return await self._complete_structured_openai(prompt, pydantic_model, step_name, start_time)
async def _complete_structured_vllm(self, prompt: str, pydantic_model, step_name: str, start_time: float):
    """Request schema-constrained output from a vLLM server via ``extra_body``.

    The JSON schema of ``pydantic_model`` is sent first under the
    ``structured_outputs`` key and, if that attempt fails for any reason
    (request error or response validation error), again under the legacy
    ``guided_json`` key.

    Args:
        prompt: Text sent as the single user message.
        pydantic_model: Class exposing ``model_json_schema()`` and
            ``model_validate_json()``.
        step_name: Label used as a prefix in every log message.
        start_time: ``time.perf_counter()`` reading to measure elapsed
            time against.

    Returns:
        The value of ``pydantic_model.model_validate_json`` applied to the
        response content of the first attempt that succeeds.

    Raises:
        LLMClientError: Every format failed. The error is chained to the
            exception raised by the last attempt.
    """
    schema = pydantic_model.model_json_schema()
    self.logger.info(
        "[%s] vLLM structured: sending guided_json schema=%s",
        step_name,
        json.dumps(schema)[:300],
    )

    # ``except ... as exc`` unbinds ``exc`` when the handler exits, so the
    # failure is kept here for the final log record and exception chaining.
    last_exc = None

    # Try the new unified format first, then legacy guided_json
    for fmt_name, extra in (
        ("structured_outputs", {"structured_outputs": {"json": schema}}),
        ("guided_json", {"guided_json": schema}),
    ):
        try:
            self.logger.info("[%s] vLLM structured: trying format=%s", step_name, fmt_name)
            response = await self._client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                extra_body=extra,
            )
            # A missing message body is treated as an empty string so that
            # validation (not slicing) is what reports the problem.
            content = response.choices[0].message.content or ""
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            self.logger.info(
                "[%s] vLLM structured succeeded with format=%s in %.2fms. Response: %s",
                step_name, fmt_name, elapsed_ms, content[:200],
            )
            return pydantic_model.model_validate_json(content)
        except Exception as exc:
            # Deliberately broad: any failure moves on to the next format.
            last_exc = exc
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            self.logger.warning(
                "[%s] vLLM structured format=%s failed after %.2fms: %s",
                step_name, fmt_name, elapsed_ms, exc,
            )

    elapsed_ms = (time.perf_counter() - start_time) * 1000
    # No exception is active here, so ``exc_info=True`` would log no
    # traceback; pass the saved exception instance instead.
    self.logger.error(
        "[%s] vLLM structured: all formats failed after %.2fms", step_name, elapsed_ms,
        exc_info=last_exc,
    )
    raise LLMClientError("vLLM structured output failed with all guided formats") from last_exc
async def _complete_structured_openai(self, prompt: str, pydantic_model, step_name: str, start_time: float):
"""Use OpenAI-native json_schema via LangChain's with_structured_output()."""
try: try:
model = self._get_langchain_model() model = self._get_langchain_model()
# vLLM servers may not support json_schema response_format. Use self.logger.info("[%s] Structured output method: json_schema (OpenAI-native)", step_name)
# function_calling instead, which is more widely supported by structured = model.with_structured_output(pydantic_model, method="json_schema")
# open-source models served through vLLM.
method = "function_calling" if self.settings.vllm_engine else "json_schema"
self.logger.info("[%s] Structured output method: %s", step_name, method)
structured = model.with_structured_output(pydantic_model, method=method)
result = await structured.ainvoke(prompt) result = await structured.ainvoke(prompt)
elapsed_ms = (time.perf_counter() - start_time) * 1000 elapsed_ms = (time.perf_counter() - start_time) * 1000
@ -158,5 +202,9 @@ class LLMClient:
return result return result
except Exception as exc: except Exception as exc:
elapsed_ms = (time.perf_counter() - start_time) * 1000 elapsed_ms = (time.perf_counter() - start_time) * 1000
self.logger.error("[%s] Structured LLM error after %.2fms: %s", step_name, elapsed_ms, exc) self.logger.error(
"[%s] Structured LLM error after %.2fms: %s",
step_name, elapsed_ms, exc,
exc_info=True,
)
raise LLMClientError from exc raise LLMClientError from exc

View File

@ -41,10 +41,12 @@ def _extract_json_from_markdown(response: str) -> str:
def _parse_legacy_json(response: str) -> List[str]: def _parse_legacy_json(response: str) -> List[str]:
response = _extract_json_from_markdown(response) extracted = _extract_json_from_markdown(response)
logger.info("Legacy JSON parse: extracted text (first 300 chars): %s", extracted[:300] if extracted else "(empty)")
try: try:
data = json.loads(response) data = json.loads(extracted)
except json.JSONDecodeError: except json.JSONDecodeError:
logger.warning("Legacy JSON parse: json.loads failed on extracted text")
return [] return []
if not isinstance(data, list): if not isinstance(data, list):
@ -90,6 +92,7 @@ class QueryDecomposer:
if self._prompt_service is not None: if self._prompt_service is not None:
template = self._prompt_service.get_prompt_template("decompose") template = self._prompt_service.get_prompt_template("decompose")
logger.info("Decompose prompt template (first 200 chars): %s", template[:200] if template else "(empty)")
else: else:
template = _BUILTIN_DECOMPOSE_TEMPLATE template = _BUILTIN_DECOMPOSE_TEMPLATE