Merge branch 'Ref-doc-highlight-bug'

This commit is contained in:
Woody 2026-05-15 11:11:21 +08:00
commit 16fbb107f4
4 changed files with 180 additions and 49 deletions

View File

@ -42,7 +42,16 @@ async def compute_highlights_batch(
settings=settings, settings=settings,
) )
try: try:
logger.info(
"Highlight batch request: %d targets (history_id=%s)",
len(request.targets), history_id,
)
result = await service.compute_highlights_batch(request.targets) result = await service.compute_highlights_batch(request.targets)
logger.info(
"Highlight batch response: status=%s cached=%d errors=%d time=%dms",
result.status, result.cached_count, len(result.errors or []),
result.highlight_time_ms or 0,
)
if history_id is not None: if history_id is not None:
history_service = HistoryService(settings.history_db_path) history_service = HistoryService(settings.history_db_path)
history_service.update_highlights( history_service.update_highlights(

View File

@ -2,6 +2,7 @@
import json import json
import logging import logging
import re
import time import time
from collections import defaultdict from collections import defaultdict
from typing import Any from typing import Any
@ -98,6 +99,11 @@ class ChunkHighlightService:
if not targets: if not targets:
return HighlightBatchResponse(status="completed", cached_count=0) return HighlightBatchResponse(status="completed", cached_count=0)
logger.info(
"Highlight batch: %d targets received, fetching from ChromaDB...",
len(targets),
)
errors: list[str] = [] errors: list[str] = []
fetched: list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]] = [] fetched: list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]] = []
@ -123,6 +129,10 @@ class ChunkHighlightService:
errors.append(msg) errors.append(msg)
if not fetched: if not fetched:
logger.warning(
"Highlight batch: no chunks fetched from %d targets (errors=%d)",
len(targets), len(errors),
)
return HighlightBatchResponse( return HighlightBatchResponse(
status="completed" if not errors else "partial", status="completed" if not errors else "partial",
cached_count=0, cached_count=0,
@ -131,17 +141,59 @@ class ChunkHighlightService:
prompt = self._build_prompt(fetched) prompt = self._build_prompt(fetched)
logger.info(
"Highlight batch: %d/%d targets fetched, calling LLM (prompt len=%d)...",
len(fetched), len(targets), len(prompt),
)
highlight_start = time.perf_counter() highlight_start = time.perf_counter()
try: try:
llm_result: HighlightBatchResult = await self._llm.complete_structured( llm_result: HighlightBatchResult = await self._llm.complete_structured(
prompt, HighlightBatchResult, step_name="HighlightBatch" prompt, HighlightBatchResult, step_name="HighlightBatch"
) )
except Exception as exc: highlight_time_ms = int((time.perf_counter() - highlight_start) * 1000)
logger.error("HighlightBatch LLM call failed: %s", exc) logger.info(
return HighlightBatchResponse( "Highlight batch: LLM structured succeeded in %dms — %d results",
status="failed", cached_count=0, errors=[str(exc)] highlight_time_ms,
len(llm_result.results) if llm_result else 0,
) )
highlight_time_ms = int((time.perf_counter() - highlight_start) * 1000) except Exception as structured_exc:
logger.warning(
"HighlightBatch structured output failed: %s. "
"Falling back to plain complete() with JSON instructions.",
structured_exc,
)
try:
fallback_prompt = (
prompt
+ "\n\nRespond ONLY with a valid JSON object matching the HighlightBatchResult schema. "
"Do NOT include markdown code fences, extra commentary, or any text outside the JSON."
)
raw_response = await self._llm.complete(
fallback_prompt, temperature=0.0, step_name="HighlightBatch-Fallback"
)
# Strip any markdown fences the model may have emitted
match = re.search(
r"```(?:json)?\s*\n?(.*?)\n?```", raw_response, re.DOTALL
)
if match:
raw_response = match.group(1).strip()
llm_result = HighlightBatchResult.model_validate_json(raw_response)
highlight_time_ms = int((time.perf_counter() - highlight_start) * 1000)
logger.info(
"HighlightBatch fallback complete() succeeded in %dms — %d results",
highlight_time_ms,
len(llm_result.results) if llm_result else 0,
)
except Exception as fallback_exc:
logger.error(
"HighlightBatch fallback also failed: %s", fallback_exc
)
return HighlightBatchResponse(
status="failed",
cached_count=0,
errors=[str(structured_exc), str(fallback_exc)],
)
cached_count = self._cache_results(fetched, llm_result) cached_count = self._cache_results(fetched, llm_result)
highlight_response_json = llm_result.model_dump_json() highlight_response_json = llm_result.model_dump_json()
@ -155,6 +207,10 @@ class ChunkHighlightService:
errors.append(f"No highlight result for {doc_id}_{chunk_idx}") errors.append(f"No highlight result for {doc_id}_{chunk_idx}")
status = "partial" if (errors or missing) else "completed" status = "partial" if (errors or missing) else "completed"
logger.info(
"Highlight batch: done — status=%s cached=%d/%d errors=%d missing=%d time=%dms",
status, cached_count, len(fetched), len(errors), len(missing), highlight_time_ms,
)
return HighlightBatchResponse( return HighlightBatchResponse(
status=status, status=status,
cached_count=cached_count, cached_count=cached_count,

View File

@ -1,6 +1,7 @@
import json import json
import logging import logging
import os import os
import re
import time import time
from typing import Any, Optional from typing import Any, Optional
@ -137,58 +138,103 @@ class LLMClient:
return await self._complete_structured_openai(prompt, pydantic_model, step_name, start_time) return await self._complete_structured_openai(prompt, pydantic_model, step_name, start_time)
def _strip_markdown_fence(self, content: str) -> str:
"""Strip markdown code fences from LLM output if present.
Some models (especially vLLM) may wrap JSON output in ```json ... ``` fences
even when structured output is requested.
"""
match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", content, re.DOTALL)
if match:
return match.group(1).strip()
return content
async def _complete_structured_vllm(self, prompt: str, pydantic_model, step_name: str, start_time: float): async def _complete_structured_vllm(self, prompt: str, pydantic_model, step_name: str, start_time: float):
"""Use vLLM's native guided_json via extra_body for structured output.""" """Use vLLM-native structured output via response_format or structured_outputs extra_body.
Tier 1 (recommended): OpenAI-native response_format (vLLM v0.6.4+)
- Portable across OpenRouter, OpenAI, and vLLM
- vLLM RFC #19097 confirms this is the future direction
Tier 2 (fallback): extra_body structured_outputs (vLLM v0.8+)
- Battle-tested fallback for v0.8+ deployments
guided_json is deliberately removed it was removed in vLLM v0.12.0.
"""
schema = pydantic_model.model_json_schema() schema = pydantic_model.model_json_schema()
model_name = pydantic_model.__name__
prompt_preview = self._truncate_prompt_for_log(prompt, first_chars=300, last_chars=100) prompt_preview = self._truncate_prompt_for_log(prompt, first_chars=300, last_chars=100)
self.logger.info( self.logger.info(
"[%s] vLLM structured: prompt=%s schema=%s", "[%s] vLLM structured: prompt=%s schema=%s",
step_name, prompt_preview, json.dumps(schema)[:300], step_name, prompt_preview, json.dumps(schema)[:300],
) )
# Merge thinking-control params so vLLM structured calls # ------------------------------------------------------------------
# also respect enable_thinking/vllm_engine config (was missing). # Tier 1: OpenAI-native response_format (vLLM v0.6.4+)
body = self._build_extra_body() # ------------------------------------------------------------------
# vLLM's protocol.py to_sampling_params() converts this to
# StructuredOutputsParams since PR #7654 (Aug 2024). RFC #19097
# confirms this will become the primary structured output API.
try:
response = await self._client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
temperature=0.0,
response_format={
"type": "json_schema",
"json_schema": {"name": model_name, "schema": schema},
},
)
content = response.choices[0].message.content or ""
content = self._strip_markdown_fence(content)
elapsed_ms = (time.perf_counter() - start_time) * 1000
self.logger.info(
"[%s] vLLM structured succeeded via response_format in %.2fms "
"tokens=(%s/%s)",
step_name, elapsed_ms,
response.usage.prompt_tokens if response.usage else "?",
response.usage.completion_tokens if response.usage else "?",
)
return pydantic_model.model_validate_json(content)
except Exception as exc:
elapsed_ms = (time.perf_counter() - start_time) * 1000
self.logger.warning(
"[%s] response_format failed after %.2fms: %s. "
"Trying structured_outputs fallback.",
step_name, elapsed_ms, exc,
)
# Try the new unified format first, then legacy guided_json # ------------------------------------------------------------------
for fmt_name, base_extra in [ # Tier 2: extra_body structured_outputs (vLLM v0.8+)
("structured_outputs", {"structured_outputs": {"json": schema}}), # ------------------------------------------------------------------
("guided_json", {"guided_json": schema}), # This is the battle-tested format with extensive test coverage in
]: # vLLM's test_chat.py. Used by multiple real-world projects.
extra = {**base_extra, **body} try:
try: response = await self._client.chat.completions.create(
self.logger.info("[%s] vLLM structured: trying format=%s extra=%s", step_name, fmt_name, extra) model=self.model,
response = await self._client.chat.completions.create( messages=[{"role": "user", "content": prompt}],
model=self.model, temperature=0.0,
messages=[{"role": "user", "content": prompt}], extra_body={"structured_outputs": {"json": schema}},
temperature=0.0, )
extra_body=extra, content = response.choices[0].message.content or ""
) content = self._strip_markdown_fence(content)
self.logger.info("[%s] vLLM structured full response: %s", step_name, response) elapsed_ms = (time.perf_counter() - start_time) * 1000
content = response.choices[0].message.content or "" self.logger.info(
elapsed_ms = (time.perf_counter() - start_time) * 1000 "[%s] vLLM structured succeeded via structured_outputs in %.2fms "
self.logger.info( "tokens=(%s/%s)",
"[%s] vLLM structured succeeded format=%s in %.2fms tokens=(%s/%s). Parsed=%s", step_name, elapsed_ms,
step_name, fmt_name, elapsed_ms, response.usage.prompt_tokens if response.usage else "?",
response.usage.prompt_tokens if response.usage else "?", response.usage.completion_tokens if response.usage else "?",
response.usage.completion_tokens if response.usage else "?", )
content, return pydantic_model.model_validate_json(content)
) except Exception as exc:
return pydantic_model.model_validate_json(content) elapsed_ms = (time.perf_counter() - start_time) * 1000
except Exception as exc: self.logger.error(
elapsed_ms = (time.perf_counter() - start_time) * 1000 "[%s] Both response_format and structured_outputs failed after %.2fms",
self.logger.warning( step_name, elapsed_ms, exc_info=True,
"[%s] vLLM structured format=%s failed after %.2fms: %s", )
step_name, fmt_name, elapsed_ms, exc,
exc_info=True,
)
elapsed_ms = (time.perf_counter() - start_time) * 1000 raise LLMClientError("vLLM structured output failed with all supported formats")
self.logger.error(
"[%s] vLLM structured: all formats failed after %.2fms", step_name, elapsed_ms,
exc_info=True,
)
raise LLMClientError("vLLM structured output failed with all guided formats")
async def _complete_structured_openai(self, prompt: str, pydantic_model, step_name: str, start_time: float): async def _complete_structured_openai(self, prompt: str, pydantic_model, step_name: str, start_time: float):
"""Use OpenAI-native json_schema via LangChain's with_structured_output().""" """Use OpenAI-native json_schema via LangChain's with_structured_output()."""

View File

@ -204,6 +204,12 @@ function SubQuestionSections({
setHighlightStatus('loading') setHighlightStatus('loading')
console.log(
`[Highlight] Sending batch request: ${targets.length} targets`,
targets.map((t: { document_id: string; chunk_index: number }) => `${t.document_id}#${t.chunk_index}`),
)
const startTime = performance.now()
const url = historyId const url = historyId
? `http://localhost:8000/api/v1/v2/highlights/batch?history_id=${historyId}` ? `http://localhost:8000/api/v1/v2/highlights/batch?history_id=${historyId}`
: 'http://localhost:8000/api/v1/v2/highlights/batch' : 'http://localhost:8000/api/v1/v2/highlights/batch'
@ -213,10 +219,23 @@ function SubQuestionSections({
body: JSON.stringify({ targets }), body: JSON.stringify({ targets }),
}) })
.then((res) => res.json()) .then((res) => res.json())
.then((data) => { .then((data: { status: string; cached_count?: number; errors?: string[]; highlight_time_ms?: number }) => {
const elapsed = Math.round(performance.now() - startTime)
console.log(
`[Highlight] Response: status=${data.status} cached=${data.cached_count ?? '?'} ` +
`errors=${data.errors?.length ?? 0} backend_time=${data.highlight_time_ms ?? '?'}ms total=${elapsed}ms`,
)
if (data.errors?.length) {
console.warn('[Highlight] Errors:', data.errors)
}
if (data.status === 'failed') {
console.error('[Highlight] Batch computation failed:', data.errors)
setHighlightStatus('')
return
}
if (data.status === 'completed' || data.status === 'partial') { if (data.status === 'completed' || data.status === 'partial') {
const keys = new Set<string>() const keys = new Set<string>()
targets.forEach((t) => { targets.forEach((t: { document_id: string; chunk_index: number; sub_question_text: string }) => {
keys.add(`${t.document_id}_${t.chunk_index}_${encodeURIComponent(t.sub_question_text)}`) keys.add(`${t.document_id}_${t.chunk_index}_${encodeURIComponent(t.sub_question_text)}`)
}) })
setHighlightReadyKeys(keys) setHighlightReadyKeys(keys)
@ -226,6 +245,7 @@ function SubQuestionSections({
}) })
.catch((err) => { .catch((err) => {
console.error('Highlight batch computation failed:', err) console.error('Highlight batch computation failed:', err)
setHighlightStatus('')
}) })
}, [answer, isLoading, subQuestionSources, historyId]) }, [answer, isLoading, subQuestionSources, historyId])