diff --git a/backend/app/core/sqlite_db.py b/backend/app/core/sqlite_db.py index 0bcd724..73b1d96 100644 --- a/backend/app/core/sqlite_db.py +++ b/backend/app/core/sqlite_db.py @@ -13,7 +13,8 @@ _SEED_DECOMPOSE = ( "Given this question: '{question}'\n\n" "Break it down into 2-5 simplified sub-questions that would help " "search for relevant information. Each sub-question should be short " - "and focused on one aspect." + "and focused on one aspect.\n\n" + 'Return a JSON array of strings: ["sub-question 1", "sub-question 2", ...]' ) _SEED_FILTER = ( diff --git a/backend/app/services/llm_client.py b/backend/app/services/llm_client.py index 1672997..b0b8154 100644 --- a/backend/app/services/llm_client.py +++ b/backend/app/services/llm_client.py @@ -1,7 +1,8 @@ +import json import logging import os import time -from typing import Optional +from typing import Any, Optional import httpx from openai import AsyncOpenAI, APIError, APITimeoutError @@ -131,17 +132,60 @@ class LLMClient: self.logger.info("[%s] Structured LLM request started. 
Prompt: %s", step_name, prompt_preview)
         start_time = time.perf_counter()
-        extra_body = self._build_extra_body()
-        self.logger.info("[%s] Structured LLM Extra Body: %s", step_name, str(extra_body))
+        if self.settings.vllm_engine:
+            return await self._complete_structured_vllm(prompt, pydantic_model, step_name, start_time)
+        return await self._complete_structured_openai(prompt, pydantic_model, step_name, start_time)
+
+    async def _complete_structured_vllm(self, prompt: str, pydantic_model, step_name: str, start_time: float):
+        """Request schema-constrained output from vLLM via ``extra_body``.
+
+        The JSON schema of ``pydantic_model`` is sent first under the
+        ``structured_outputs`` key and, if that attempt raises, under the
+        legacy ``guided_json`` key. The first response whose content
+        validates against ``pydantic_model`` is returned.
+
+        Args:
+            prompt: Text sent as the single user message.
+            pydantic_model: Model class providing ``model_json_schema()``
+                and ``model_validate_json()``.
+            step_name: Label prefixed to every log line.
+            start_time: ``time.perf_counter()`` reading taken by the caller;
+                used only to report elapsed milliseconds.
+
+        Returns:
+            The instance produced by ``pydantic_model.model_validate_json``.
+
+        Raises:
+            LLMClientError: If both formats fail; the last underlying
+                exception is attached as ``__cause__``.
+        """
+        schema = pydantic_model.model_json_schema()
+        self.logger.info(
+            "[%s] vLLM structured: sending guided_json schema=%s",
+            step_name,
+            json.dumps(schema)[:300],
+        )
+
+        # ``exc`` is unbound once its ``except`` block exits, so keep the most
+        # recent failure here for the final error log and exception chaining.
+        last_exc: Optional[Exception] = None
+
+        # Try the new unified format first, then legacy guided_json
+        for fmt_name, extra in [
+            ("structured_outputs", {"structured_outputs": {"json": schema}}),
+            ("guided_json", {"guided_json": schema}),
+        ]:
+            try:
+                self.logger.info("[%s] vLLM structured: trying format=%s", step_name, fmt_name)
+                response = await self._client.chat.completions.create(
+                    model=self.model,
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=0.0,
+                    extra_body=extra,
+                )
+                content = response.choices[0].message.content or ""
+                elapsed_ms = (time.perf_counter() - start_time) * 1000
+                self.logger.info(
+                    "[%s] vLLM structured succeeded with format=%s in %.2fms. Response: %s",
+                    step_name, fmt_name, elapsed_ms, content[:200],
+                )
+                return pydantic_model.model_validate_json(content)
+            except Exception as exc:
+                # Deliberately broad: a request error or a validation error
+                # both fall through to the next format.
+                last_exc = exc
+                elapsed_ms = (time.perf_counter() - start_time) * 1000
+                self.logger.warning(
+                    "[%s] vLLM structured format=%s failed after %.2fms: %s",
+                    step_name, fmt_name, elapsed_ms, exc,
+                )
+
+        elapsed_ms = (time.perf_counter() - start_time) * 1000
+        # No exception is being handled at this point, so ``exc_info=True``
+        # would log "NoneType: None"; pass the saved exception explicitly.
+        self.logger.error(
+            "[%s] vLLM structured: all formats failed after %.2fms", step_name, elapsed_ms,
+            exc_info=last_exc,
+        )
+        raise LLMClientError("vLLM structured output failed with all guided formats") from last_exc
+
+    async def _complete_structured_openai(self, prompt: str, pydantic_model, step_name: str, start_time: float):
+        """Use OpenAI-native json_schema via LangChain's with_structured_output()."""
         try:
             model = self._get_langchain_model()
-            # vLLM servers may not support json_schema response_format. Use
-            # function_calling instead, which is more widely supported by
-            # open-source models served through vLLM.
- method = "function_calling" if self.settings.vllm_engine else "json_schema" - self.logger.info("[%s] Structured output method: %s", step_name, method) - structured = model.with_structured_output(pydantic_model, method=method) + self.logger.info("[%s] Structured output method: json_schema (OpenAI-native)", step_name) + structured = model.with_structured_output(pydantic_model, method="json_schema") result = await structured.ainvoke(prompt) elapsed_ms = (time.perf_counter() - start_time) * 1000 @@ -158,5 +202,9 @@ class LLMClient: return result except Exception as exc: elapsed_ms = (time.perf_counter() - start_time) * 1000 - self.logger.error("[%s] Structured LLM error after %.2fms: %s", step_name, elapsed_ms, exc) + self.logger.error( + "[%s] Structured LLM error after %.2fms: %s", + step_name, elapsed_ms, exc, + exc_info=True, + ) raise LLMClientError from exc diff --git a/backend/app/services/query_decomposer.py b/backend/app/services/query_decomposer.py index 4b45093..e7293e6 100644 --- a/backend/app/services/query_decomposer.py +++ b/backend/app/services/query_decomposer.py @@ -41,10 +41,12 @@ def _extract_json_from_markdown(response: str) -> str: def _parse_legacy_json(response: str) -> List[str]: - response = _extract_json_from_markdown(response) + extracted = _extract_json_from_markdown(response) + logger.info("Legacy JSON parse: extracted text (first 300 chars): %s", extracted[:300] if extracted else "(empty)") try: - data = json.loads(response) + data = json.loads(extracted) except json.JSONDecodeError: + logger.warning("Legacy JSON parse: json.loads failed on extracted text") return [] if not isinstance(data, list): @@ -90,6 +92,7 @@ class QueryDecomposer: if self._prompt_service is not None: template = self._prompt_service.get_prompt_template("decompose") + logger.info("Decompose prompt template (first 200 chars): %s", template[:200] if template else "(empty)") else: template = _BUILTIN_DECOMPOSE_TEMPLATE