diff --git a/backend/app/routers/chunks.py b/backend/app/routers/chunks.py index 8ff58bf..298b3a5 100644 --- a/backend/app/routers/chunks.py +++ b/backend/app/routers/chunks.py @@ -42,7 +42,16 @@ async def compute_highlights_batch( settings=settings, ) try: + logger.info( + "Highlight batch request: %d targets (history_id=%s)", + len(request.targets), history_id, + ) result = await service.compute_highlights_batch(request.targets) + logger.info( + "Highlight batch response: status=%s cached=%d errors=%d time=%dms", + result.status, result.cached_count, len(result.errors or []), + result.highlight_time_ms or 0, + ) if history_id is not None: history_service = HistoryService(settings.history_db_path) history_service.update_highlights( diff --git a/backend/app/services/chunk_highlight_service.py b/backend/app/services/chunk_highlight_service.py index 694313c..5851b80 100644 --- a/backend/app/services/chunk_highlight_service.py +++ b/backend/app/services/chunk_highlight_service.py @@ -2,6 +2,7 @@ import json import logging +import re import time from collections import defaultdict from typing import Any @@ -98,6 +99,11 @@ class ChunkHighlightService: if not targets: return HighlightBatchResponse(status="completed", cached_count=0) + logger.info( + "Highlight batch: %d targets received, fetching from ChromaDB...", + len(targets), + ) + errors: list[str] = [] fetched: list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]] = [] @@ -123,6 +129,10 @@ class ChunkHighlightService: errors.append(msg) if not fetched: + logger.warning( + "Highlight batch: no chunks fetched from %d targets (errors=%d)", + len(targets), len(errors), + ) return HighlightBatchResponse( status="completed" if not errors else "partial", cached_count=0, @@ -131,17 +141,59 @@ class ChunkHighlightService: prompt = self._build_prompt(fetched) + logger.info( + "Highlight batch: %d/%d targets fetched, calling LLM (prompt len=%d)...", + len(fetched), len(targets), len(prompt), + ) + highlight_start = time.perf_counter() try: llm_result: HighlightBatchResult = await self._llm.complete_structured( prompt, HighlightBatchResult, step_name="HighlightBatch" ) - except Exception as exc: - logger.error("HighlightBatch LLM call failed: %s", exc) - return HighlightBatchResponse( - status="failed", cached_count=0, errors=[str(exc)] + highlight_time_ms = int((time.perf_counter() - highlight_start) * 1000) + logger.info( + "Highlight batch: LLM structured succeeded in %dms — %d results", + highlight_time_ms, + len(llm_result.results) if llm_result else 0, ) - highlight_time_ms = int((time.perf_counter() - highlight_start) * 1000) + except Exception as structured_exc: + logger.warning( + "HighlightBatch structured output failed: %s. " + "Falling back to plain complete() with JSON instructions.", + structured_exc, + ) + try: + fallback_prompt = ( + prompt + + "\n\nRespond ONLY with a valid JSON object matching the HighlightBatchResult schema. " + "Do NOT include markdown code fences, extra commentary, or any text outside the JSON." + ) + raw_response = await self._llm.complete( + fallback_prompt, temperature=0.0, step_name="HighlightBatch-Fallback" + ) + # Strip any markdown fences the model may have emitted + match = re.search( + r"```(?:json)?\s*\n?(.*?)\n?```", raw_response, re.DOTALL + ) + if match: + raw_response = match.group(1).strip() + llm_result = HighlightBatchResult.model_validate_json(raw_response) + highlight_time_ms = int((time.perf_counter() - highlight_start) * 1000) + logger.info( + "HighlightBatch fallback complete() succeeded in %dms — %d results", + highlight_time_ms, + len(llm_result.results) if llm_result else 0, + ) + except Exception as fallback_exc: + logger.error( + "HighlightBatch fallback also failed: %s", fallback_exc + ) + return HighlightBatchResponse( + status="failed", + cached_count=0, + errors=[str(structured_exc), str(fallback_exc)], + ) cached_count = self._cache_results(fetched, llm_result) highlight_response_json = llm_result.model_dump_json() @@ -155,6 +207,10 @@ class ChunkHighlightService: errors.append(f"No highlight result for {doc_id}_{chunk_idx}") status = "partial" if (errors or missing) else "completed" + logger.info( + "Highlight batch: done — status=%s cached=%d/%d errors=%d missing=%d time=%dms", + status, cached_count, len(fetched), len(errors), len(missing), highlight_time_ms, + ) return HighlightBatchResponse( status=status, cached_count=cached_count, diff --git a/backend/app/services/llm_client.py b/backend/app/services/llm_client.py index e54c454..a5553b0 100644 --- a/backend/app/services/llm_client.py +++ b/backend/app/services/llm_client.py @@ -1,6 +1,7 @@ import json import logging import os +import re import time from typing import Any, Optional @@ -137,58 +138,103 @@ class LLMClient: return await self._complete_structured_openai(prompt, pydantic_model, step_name, start_time) + def _strip_markdown_fence(self, content: str) -> str: + """Strip markdown code fences from LLM output if present. + + Some models (especially vLLM) may wrap JSON output in ```json ... ``` fences + even when structured output is requested. + """ + match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", content, re.DOTALL) + if match: + return match.group(1).strip() + return content + async def _complete_structured_vllm(self, prompt: str, pydantic_model, step_name: str, start_time: float): - """Use vLLM's native guided_json via extra_body for structured output.""" + """Use vLLM-native structured output via response_format or structured_outputs extra_body. + + Tier 1 (recommended): OpenAI-native response_format (vLLM v0.6.4+) + - Portable across OpenRouter, OpenAI, and vLLM + - vLLM RFC #19097 confirms this is the future direction + + Tier 2 (fallback): extra_body structured_outputs (vLLM v0.8+) + - Battle-tested fallback for v0.8+ deployments + + guided_json is deliberately removed — it was removed in vLLM v0.12.0. + """ schema = pydantic_model.model_json_schema() + model_name = pydantic_model.__name__ prompt_preview = self._truncate_prompt_for_log(prompt, first_chars=300, last_chars=100) self.logger.info( "[%s] vLLM structured: prompt=%s schema=%s", step_name, prompt_preview, json.dumps(schema)[:300], ) - # Merge thinking-control params so vLLM structured calls - # also respect enable_thinking/vllm_engine config (was missing). - body = self._build_extra_body() + # ------------------------------------------------------------------ + # Tier 1: OpenAI-native response_format (vLLM v0.6.4+) + # ------------------------------------------------------------------ + # vLLM's protocol.py to_sampling_params() converts this to + # StructuredOutputsParams since PR #7654 (Aug 2024). RFC #19097 + # confirms this will become the primary structured output API. + try: + response = await self._client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + temperature=0.0, + response_format={ + "type": "json_schema", + "json_schema": {"name": model_name, "schema": schema}, + }, + ) + content = response.choices[0].message.content or "" + content = self._strip_markdown_fence(content) + elapsed_ms = (time.perf_counter() - start_time) * 1000 + self.logger.info( + "[%s] vLLM structured succeeded via response_format in %.2fms " + "tokens=(%s/%s)", + step_name, elapsed_ms, + response.usage.prompt_tokens if response.usage else "?", + response.usage.completion_tokens if response.usage else "?", + ) + return pydantic_model.model_validate_json(content) + except Exception as exc: + elapsed_ms = (time.perf_counter() - start_time) * 1000 + self.logger.warning( + "[%s] response_format failed after %.2fms: %s. " + "Trying structured_outputs fallback.", + step_name, elapsed_ms, exc, + ) - # Try the new unified format first, then legacy guided_json - for fmt_name, base_extra in [ - ("structured_outputs", {"structured_outputs": {"json": schema}}), - ("guided_json", {"guided_json": schema}), - ]: - extra = {**base_extra, **body} - try: - self.logger.info("[%s] vLLM structured: trying format=%s extra=%s", step_name, fmt_name, extra) - response = await self._client.chat.completions.create( - model=self.model, - messages=[{"role": "user", "content": prompt}], - temperature=0.0, - extra_body=extra, - ) - self.logger.info("[%s] vLLM structured full response: %s", step_name, response) - content = response.choices[0].message.content or "" - elapsed_ms = (time.perf_counter() - start_time) * 1000 - self.logger.info( - "[%s] vLLM structured succeeded format=%s in %.2fms tokens=(%s/%s). Parsed=%s", - step_name, fmt_name, elapsed_ms, - response.usage.prompt_tokens if response.usage else "?", - response.usage.completion_tokens if response.usage else "?", - content, - ) - return pydantic_model.model_validate_json(content) - except Exception as exc: - elapsed_ms = (time.perf_counter() - start_time) * 1000 - self.logger.warning( - "[%s] vLLM structured format=%s failed after %.2fms: %s", - step_name, fmt_name, elapsed_ms, exc, - exc_info=True, - ) + # ------------------------------------------------------------------ + # Tier 2: extra_body structured_outputs (vLLM v0.8+) + # ------------------------------------------------------------------ + # This is the battle-tested format with extensive test coverage in + # vLLM's test_chat.py. Used by multiple real-world projects. + try: + response = await self._client.chat.completions.create( + model=self.model, + messages=[{"role": "user", "content": prompt}], + temperature=0.0, + extra_body={"structured_outputs": {"json": schema}}, + ) + content = response.choices[0].message.content or "" + content = self._strip_markdown_fence(content) + elapsed_ms = (time.perf_counter() - start_time) * 1000 + self.logger.info( + "[%s] vLLM structured succeeded via structured_outputs in %.2fms " + "tokens=(%s/%s)", + step_name, elapsed_ms, + response.usage.prompt_tokens if response.usage else "?", + response.usage.completion_tokens if response.usage else "?", + ) + return pydantic_model.model_validate_json(content) + except Exception as exc: + elapsed_ms = (time.perf_counter() - start_time) * 1000 + self.logger.error( + "[%s] Both response_format and structured_outputs failed after %.2fms", + step_name, elapsed_ms, exc_info=True, + ) - elapsed_ms = (time.perf_counter() - start_time) * 1000 - self.logger.error( - "[%s] vLLM structured: all formats failed after %.2fms", step_name, elapsed_ms, - exc_info=True, - ) - raise LLMClientError("vLLM structured output failed with all guided formats") + raise LLMClientError("vLLM structured output failed with all supported formats") async def _complete_structured_openai(self, prompt: str, pydantic_model, step_name: str, start_time: float): """Use OpenAI-native json_schema via LangChain's with_structured_output().""" diff --git a/frontend/src/components/ResponsePanel.tsx b/frontend/src/components/ResponsePanel.tsx index b93c6ba..b6050a3 100644 --- a/frontend/src/components/ResponsePanel.tsx +++ b/frontend/src/components/ResponsePanel.tsx @@ -204,6 +204,12 @@ function SubQuestionSections({ setHighlightStatus('loading') + console.log( + `[Highlight] Sending batch request: ${targets.length} targets`, + targets.map((t: { document_id: string; chunk_index: number }) => `${t.document_id}#${t.chunk_index}`), + ) + const startTime = performance.now() + const url = historyId ? `http://localhost:8000/api/v1/v2/highlights/batch?history_id=${historyId}` : 'http://localhost:8000/api/v1/v2/highlights/batch' @@ -213,10 +219,23 @@ function SubQuestionSections({ body: JSON.stringify({ targets }), }) .then((res) => res.json()) - .then((data) => { + .then((data: { status: string; cached_count?: number; errors?: string[]; highlight_time_ms?: number }) => { + const elapsed = Math.round(performance.now() - startTime) + console.log( + `[Highlight] Response: status=${data.status} cached=${data.cached_count ?? '?'} ` + + `errors=${data.errors?.length ?? 0} backend_time=${data.highlight_time_ms ?? '?'}ms total=${elapsed}ms`, + ) + if (data.errors?.length) { + console.warn('[Highlight] Errors:', data.errors) + } + if (data.status === 'failed') { + console.error('[Highlight] Batch computation failed:', data.errors) + setHighlightStatus('') + return + } if (data.status === 'completed' || data.status === 'partial') { const keys = new Set() - targets.forEach((t) => { + targets.forEach((t: { document_id: string; chunk_index: number; sub_question_text: string }) => { keys.add(`${t.document_id}_${t.chunk_index}_${encodeURIComponent(t.sub_question_text)}`) }) setHighlightReadyKeys(keys) @@ -226,6 +245,7 @@ function SubQuestionSections({ }) .catch((err) => { console.error('Highlight batch computation failed:', err) + setHighlightStatus('') }) }, [answer, isLoading, subQuestionSources, historyId])