fix: use vLLM-native guided_json for structured output
vLLM servers support JSON schema enforcement via extra_body (guided_json
or structured_outputs), not OpenAI's response_format protocol. LangChain's
with_structured_output(method='json_schema') sends response_format, which
vLLM ignores, causing "'NoneType' object is not iterable" parsing errors.
- vLLM path: direct OpenAI SDK call with extra_body={guided_json|structured_outputs}
- OpenRouter path: unchanged; still uses with_structured_output(method='json_schema')
- Try the new 'structured_outputs' format first, then fall back to legacy 'guided_json'
- Update _SEED_DECOMPOSE with explicit JSON array instruction
- Add diagnostic logging: exc_info=True, schema preview, prompt template preview
- Add logging in _parse_legacy_json for fallback failure debugging
This commit is contained in:
parent
2aca18d30e
commit
3ab6fd102a
|
|
@ -13,7 +13,8 @@ _SEED_DECOMPOSE = (
|
||||||
"Given this question: '{question}'\n\n"
|
"Given this question: '{question}'\n\n"
|
||||||
"Break it down into 2-5 simplified sub-questions that would help "
|
"Break it down into 2-5 simplified sub-questions that would help "
|
||||||
"search for relevant information. Each sub-question should be short "
|
"search for relevant information. Each sub-question should be short "
|
||||||
"and focused on one aspect."
|
"and focused on one aspect.\n\n"
|
||||||
|
'Return a JSON array of strings: ["sub-question 1", "sub-question 2", ...]'
|
||||||
)
|
)
|
||||||
|
|
||||||
_SEED_FILTER = (
|
_SEED_FILTER = (
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,8 @@
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from typing import Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
from openai import AsyncOpenAI, APIError, APITimeoutError
|
from openai import AsyncOpenAI, APIError, APITimeoutError
|
||||||
|
|
@ -131,17 +132,60 @@ class LLMClient:
|
||||||
self.logger.info("[%s] Structured LLM request started. Prompt: %s", step_name, prompt_preview)
|
self.logger.info("[%s] Structured LLM request started. Prompt: %s", step_name, prompt_preview)
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
extra_body = self._build_extra_body()
|
if self.settings.vllm_engine:
|
||||||
self.logger.info("[%s] Structured LLM Extra Body: %s", step_name, str(extra_body))
|
return await self._complete_structured_vllm(prompt, pydantic_model, step_name, start_time)
|
||||||
|
|
||||||
|
return await self._complete_structured_openai(prompt, pydantic_model, step_name, start_time)
|
||||||
|
|
||||||
|
async def _complete_structured_vllm(self, prompt: str, pydantic_model, step_name: str, start_time: float):
|
||||||
|
"""Use vLLM's native guided_json via extra_body for structured output."""
|
||||||
|
schema = pydantic_model.model_json_schema()
|
||||||
|
self.logger.info(
|
||||||
|
"[%s] vLLM structured: sending guided_json schema=%s",
|
||||||
|
step_name,
|
||||||
|
json.dumps(schema)[:300],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Try the new unified format first, then legacy guided_json
|
||||||
|
for fmt_name, extra in [
|
||||||
|
("structured_outputs", {"structured_outputs": {"json": schema}}),
|
||||||
|
("guided_json", {"guided_json": schema}),
|
||||||
|
]:
|
||||||
|
try:
|
||||||
|
self.logger.info("[%s] vLLM structured: trying format=%s", step_name, fmt_name)
|
||||||
|
response = await self._client.chat.completions.create(
|
||||||
|
model=self.model,
|
||||||
|
messages=[{"role": "user", "content": prompt}],
|
||||||
|
temperature=0.0,
|
||||||
|
extra_body=extra,
|
||||||
|
)
|
||||||
|
content = response.choices[0].message.content or ""
|
||||||
|
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
||||||
|
self.logger.info(
|
||||||
|
"[%s] vLLM structured succeeded with format=%s in %.2fms. Response: %s",
|
||||||
|
step_name, fmt_name, elapsed_ms, content[:200],
|
||||||
|
)
|
||||||
|
return pydantic_model.model_validate_json(content)
|
||||||
|
except Exception as exc:
|
||||||
|
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
||||||
|
self.logger.warning(
|
||||||
|
"[%s] vLLM structured format=%s failed after %.2fms: %s",
|
||||||
|
step_name, fmt_name, elapsed_ms, exc,
|
||||||
|
)
|
||||||
|
|
||||||
|
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
||||||
|
self.logger.error(
|
||||||
|
"[%s] vLLM structured: all formats failed after %.2fms", step_name, elapsed_ms,
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
raise LLMClientError("vLLM structured output failed with all guided formats")
|
||||||
|
|
||||||
|
async def _complete_structured_openai(self, prompt: str, pydantic_model, step_name: str, start_time: float):
|
||||||
|
"""Use OpenAI-native json_schema via LangChain's with_structured_output()."""
|
||||||
try:
|
try:
|
||||||
model = self._get_langchain_model()
|
model = self._get_langchain_model()
|
||||||
# vLLM servers may not support json_schema response_format. Use
|
self.logger.info("[%s] Structured output method: json_schema (OpenAI-native)", step_name)
|
||||||
# function_calling instead, which is more widely supported by
|
structured = model.with_structured_output(pydantic_model, method="json_schema")
|
||||||
# open-source models served through vLLM.
|
|
||||||
method = "function_calling" if self.settings.vllm_engine else "json_schema"
|
|
||||||
self.logger.info("[%s] Structured output method: %s", step_name, method)
|
|
||||||
structured = model.with_structured_output(pydantic_model, method=method)
|
|
||||||
result = await structured.ainvoke(prompt)
|
result = await structured.ainvoke(prompt)
|
||||||
|
|
||||||
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
||||||
|
|
@ -158,5 +202,9 @@ class LLMClient:
|
||||||
return result
|
return result
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
elapsed_ms = (time.perf_counter() - start_time) * 1000
|
||||||
self.logger.error("[%s] Structured LLM error after %.2fms: %s", step_name, elapsed_ms, exc)
|
self.logger.error(
|
||||||
|
"[%s] Structured LLM error after %.2fms: %s",
|
||||||
|
step_name, elapsed_ms, exc,
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
raise LLMClientError from exc
|
raise LLMClientError from exc
|
||||||
|
|
|
||||||
|
|
@ -41,10 +41,12 @@ def _extract_json_from_markdown(response: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
def _parse_legacy_json(response: str) -> List[str]:
|
def _parse_legacy_json(response: str) -> List[str]:
|
||||||
response = _extract_json_from_markdown(response)
|
extracted = _extract_json_from_markdown(response)
|
||||||
|
logger.info("Legacy JSON parse: extracted text (first 300 chars): %s", extracted[:300] if extracted else "(empty)")
|
||||||
try:
|
try:
|
||||||
data = json.loads(response)
|
data = json.loads(extracted)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
|
logger.warning("Legacy JSON parse: json.loads failed on extracted text")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
if not isinstance(data, list):
|
if not isinstance(data, list):
|
||||||
|
|
@ -90,6 +92,7 @@ class QueryDecomposer:
|
||||||
|
|
||||||
if self._prompt_service is not None:
|
if self._prompt_service is not None:
|
||||||
template = self._prompt_service.get_prompt_template("decompose")
|
template = self._prompt_service.get_prompt_template("decompose")
|
||||||
|
logger.info("Decompose prompt template (first 200 chars): %s", template[:200] if template else "(empty)")
|
||||||
else:
|
else:
|
||||||
template = _BUILTIN_DECOMPOSE_TEMPLATE
|
template = _BUILTIN_DECOMPOSE_TEMPLATE
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue