diff --git a/backend/app/routers/chunks.py b/backend/app/routers/chunks.py
index 8ff58bf..298b3a5 100644
--- a/backend/app/routers/chunks.py
+++ b/backend/app/routers/chunks.py
@@ -42,7 +42,16 @@ async def compute_highlights_batch(
         settings=settings,
     )
     try:
+        logger.info(
+            "Highlight batch request: %d targets (history_id=%s)",
+            len(request.targets), history_id,
+        )
         result = await service.compute_highlights_batch(request.targets)
+        logger.info(
+            "Highlight batch response: status=%s cached=%d errors=%d time=%dms",
+            result.status, result.cached_count, len(result.errors or []),
+            result.highlight_time_ms or 0,
+        )
         if history_id is not None:
             history_service = HistoryService(settings.history_db_path)
             history_service.update_highlights(
diff --git a/backend/app/services/chunk_highlight_service.py b/backend/app/services/chunk_highlight_service.py
index 694313c..5851b80 100644
--- a/backend/app/services/chunk_highlight_service.py
+++ b/backend/app/services/chunk_highlight_service.py
@@ -2,6 +2,7 @@
 
 import json
 import logging
+import re
 import time
 from collections import defaultdict
 from typing import Any
@@ -98,6 +99,11 @@ class ChunkHighlightService:
         if not targets:
             return HighlightBatchResponse(status="completed", cached_count=0)
 
+        logger.info(
+            "Highlight batch: %d targets received, fetching from ChromaDB...",
+            len(targets),
+        )
+
         errors: list[str] = []
         fetched: list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]] = []
 
@@ -123,6 +129,10 @@ class ChunkHighlightService:
                 errors.append(msg)
 
         if not fetched:
+            logger.warning(
+                "Highlight batch: no chunks fetched from %d targets (errors=%d)",
+                len(targets), len(errors),
+            )
             return HighlightBatchResponse(
                 status="completed" if not errors else "partial",
                 cached_count=0,
@@ -131,17 +141,59 @@ class ChunkHighlightService:
 
         prompt = self._build_prompt(fetched)
 
+        logger.info(
+            "Highlight batch: %d/%d targets fetched, calling LLM (prompt len=%d)...",
+            len(fetched), len(targets), len(prompt),
+        )
+
         highlight_start = time.perf_counter()
         try:
             llm_result: HighlightBatchResult = await self._llm.complete_structured(
                 prompt, HighlightBatchResult, step_name="HighlightBatch"
             )
-        except Exception as exc:
-            logger.error("HighlightBatch LLM call failed: %s", exc)
-            return HighlightBatchResponse(
-                status="failed", cached_count=0, errors=[str(exc)]
+            highlight_time_ms = int((time.perf_counter() - highlight_start) * 1000)
+            logger.info(
+                "Highlight batch: LLM structured succeeded in %dms — %d results",
+                highlight_time_ms,
+                len(llm_result.results) if llm_result else 0,
             )
-        highlight_time_ms = int((time.perf_counter() - highlight_start) * 1000)
+        except Exception as structured_exc:
+            logger.warning(
+                "HighlightBatch structured output failed: %s. "
+                "Falling back to plain complete() with JSON instructions.",
+                structured_exc,
+            )
+            try:
+                fallback_prompt = (
+                    prompt
+                    + "\n\nRespond ONLY with a valid JSON object matching the HighlightBatchResult schema. "
+                    "Do NOT include markdown code fences, extra commentary, or any text outside the JSON."
+                )
+                raw_response = await self._llm.complete(
+                    fallback_prompt, temperature=0.0, step_name="HighlightBatch-Fallback"
+                )
+                # Strip any markdown fences the model may have emitted
+                match = re.search(
+                    r"```(?:json)?\s*\n?(.*?)\n?```", raw_response, re.DOTALL
+                )
+                if match:
+                    raw_response = match.group(1).strip()
+                llm_result = HighlightBatchResult.model_validate_json(raw_response)
+                highlight_time_ms = int((time.perf_counter() - highlight_start) * 1000)
+                logger.info(
+                    "HighlightBatch fallback complete() succeeded in %dms — %d results",
+                    highlight_time_ms,
+                    len(llm_result.results) if llm_result else 0,
+                )
+            except Exception as fallback_exc:
+                logger.error(
+                    "HighlightBatch fallback also failed: %s", fallback_exc
+                )
+                return HighlightBatchResponse(
+                    status="failed",
+                    cached_count=0,
+                    errors=[str(structured_exc), str(fallback_exc)],
+                )
 
         cached_count = self._cache_results(fetched, llm_result)
         highlight_response_json = llm_result.model_dump_json()
@@ -155,6 +207,10 @@ class ChunkHighlightService:
                 errors.append(f"No highlight result for {doc_id}_{chunk_idx}")
 
         status = "partial" if (errors or missing) else "completed"
+        logger.info(
+            "Highlight batch: done — status=%s cached=%d/%d errors=%d missing=%d time=%dms",
+            status, cached_count, len(fetched), len(errors), len(missing), highlight_time_ms,
+        )
         return HighlightBatchResponse(
             status=status,
             cached_count=cached_count,
diff --git a/backend/app/services/llm_client.py b/backend/app/services/llm_client.py
index e54c454..a5553b0 100644
--- a/backend/app/services/llm_client.py
+++ b/backend/app/services/llm_client.py
@@ -1,6 +1,7 @@
 import json
 import logging
 import os
+import re
 import time
 from typing import Any, Optional
 
@@ -137,58 +138,103 @@ class LLMClient:
 
         return await self._complete_structured_openai(prompt, pydantic_model, step_name, start_time)
 
+    def _strip_markdown_fence(self, content: str) -> str:
+        """Strip markdown code fences from LLM output if present.
+
+        Some models (especially vLLM) may wrap JSON output in ```json ... ``` fences
+        even when structured output is requested.
+        """
+        match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", content, re.DOTALL)
+        if match:
+            return match.group(1).strip()
+        return content
+
     async def _complete_structured_vllm(self, prompt: str, pydantic_model, step_name: str, start_time: float):
-        """Use vLLM's native guided_json via extra_body for structured output."""
+        """Use vLLM-native structured output via response_format or structured_outputs extra_body.
+
+        Tier 1 (recommended): OpenAI-native response_format (vLLM v0.6.4+)
+          - Portable across OpenRouter, OpenAI, and vLLM
+          - vLLM RFC #19097 confirms this is the future direction
+
+        Tier 2 (fallback): extra_body structured_outputs (vLLM v0.8+)
+          - Battle-tested fallback for v0.8+ deployments
+
+        guided_json is deliberately removed — it was removed in vLLM v0.12.0.
+        """
         schema = pydantic_model.model_json_schema()
+        model_name = pydantic_model.__name__
         prompt_preview = self._truncate_prompt_for_log(prompt, first_chars=300, last_chars=100)
         self.logger.info(
             "[%s] vLLM structured: prompt=%s schema=%s",
             step_name, prompt_preview, json.dumps(schema)[:300],
         )
 
-        # Merge thinking-control params so vLLM structured calls
-        # also respect enable_thinking/vllm_engine config (was missing).
-        body = self._build_extra_body()
+        # ------------------------------------------------------------------
+        # Tier 1: OpenAI-native response_format (vLLM v0.6.4+)
+        # ------------------------------------------------------------------
+        # vLLM's protocol.py to_sampling_params() converts this to
+        # StructuredOutputsParams since PR #7654 (Aug 2024). RFC #19097
+        # confirms this will become the primary structured output API.
+        try:
+            response = await self._client.chat.completions.create(
+                model=self.model,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.0,
+                response_format={
+                    "type": "json_schema",
+                    "json_schema": {"name": model_name, "schema": schema},
+                },
+            )
+            content = response.choices[0].message.content or ""
+            content = self._strip_markdown_fence(content)
+            elapsed_ms = (time.perf_counter() - start_time) * 1000
+            self.logger.info(
+                "[%s] vLLM structured succeeded via response_format in %.2fms "
+                "tokens=(%s/%s)",
+                step_name, elapsed_ms,
+                response.usage.prompt_tokens if response.usage else "?",
+                response.usage.completion_tokens if response.usage else "?",
+            )
+            return pydantic_model.model_validate_json(content)
+        except Exception as exc:
+            elapsed_ms = (time.perf_counter() - start_time) * 1000
+            self.logger.warning(
+                "[%s] response_format failed after %.2fms: %s. "
+                "Trying structured_outputs fallback.",
+                step_name, elapsed_ms, exc,
+            )
 
-        # Try the new unified format first, then legacy guided_json
-        for fmt_name, base_extra in [
-            ("structured_outputs", {"structured_outputs": {"json": schema}}),
-            ("guided_json", {"guided_json": schema}),
-        ]:
-            extra = {**base_extra, **body}
-            try:
-                self.logger.info("[%s] vLLM structured: trying format=%s extra=%s", step_name, fmt_name, extra)
-                response = await self._client.chat.completions.create(
-                    model=self.model,
-                    messages=[{"role": "user", "content": prompt}],
-                    temperature=0.0,
-                    extra_body=extra,
-                )
-                self.logger.info("[%s] vLLM structured full response: %s", step_name, response)
-                content = response.choices[0].message.content or ""
-                elapsed_ms = (time.perf_counter() - start_time) * 1000
-                self.logger.info(
-                    "[%s] vLLM structured succeeded format=%s in %.2fms tokens=(%s/%s). Parsed=%s",
-                    step_name, fmt_name, elapsed_ms,
-                    response.usage.prompt_tokens if response.usage else "?",
-                    response.usage.completion_tokens if response.usage else "?",
-                    content,
-                )
-                return pydantic_model.model_validate_json(content)
-            except Exception as exc:
-                elapsed_ms = (time.perf_counter() - start_time) * 1000
-                self.logger.warning(
-                    "[%s] vLLM structured format=%s failed after %.2fms: %s",
-                    step_name, fmt_name, elapsed_ms, exc,
-                    exc_info=True,
-                )
+        # ------------------------------------------------------------------
+        # Tier 2: extra_body structured_outputs (vLLM v0.8+)
+        # ------------------------------------------------------------------
+        # This is the battle-tested format with extensive test coverage in
+        # vLLM's test_chat.py. Used by multiple real-world projects.
+        try:
+            response = await self._client.chat.completions.create(
+                model=self.model,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.0,
+                extra_body={"structured_outputs": {"json": schema}},
+            )
+            content = response.choices[0].message.content or ""
+            content = self._strip_markdown_fence(content)
+            elapsed_ms = (time.perf_counter() - start_time) * 1000
+            self.logger.info(
+                "[%s] vLLM structured succeeded via structured_outputs in %.2fms "
+                "tokens=(%s/%s)",
+                step_name, elapsed_ms,
+                response.usage.prompt_tokens if response.usage else "?",
+                response.usage.completion_tokens if response.usage else "?",
+            )
+            return pydantic_model.model_validate_json(content)
+        except Exception as exc:
+            elapsed_ms = (time.perf_counter() - start_time) * 1000
+            self.logger.error(
+                "[%s] Both response_format and structured_outputs failed after %.2fms",
+                step_name, elapsed_ms, exc_info=True,
+            )
 
-        elapsed_ms = (time.perf_counter() - start_time) * 1000
-        self.logger.error(
-            "[%s] vLLM structured: all formats failed after %.2fms", step_name, elapsed_ms,
-            exc_info=True,
-        )
-        raise LLMClientError("vLLM structured output failed with all guided formats")
+        raise LLMClientError("vLLM structured output failed with all supported formats")
 
     async def _complete_structured_openai(self, prompt: str, pydantic_model, step_name: str, start_time: float):
         """Use OpenAI-native json_schema via LangChain's with_structured_output()."""
diff --git a/frontend/src/components/ResponsePanel.tsx b/frontend/src/components/ResponsePanel.tsx
index b93c6ba..b6050a3 100644
--- a/frontend/src/components/ResponsePanel.tsx
+++ b/frontend/src/components/ResponsePanel.tsx
@@ -204,6 +204,12 @@ function SubQuestionSections({
     
     setHighlightStatus('loading')
     
+    console.log(
+      `[Highlight] Sending batch request: ${targets.length} targets`,
+      targets.map((t: { document_id: string; chunk_index: number }) => `${t.document_id}#${t.chunk_index}`),
+    )
+    const startTime = performance.now()
+    
     const url = historyId
       ? `http://localhost:8000/api/v1/v2/highlights/batch?history_id=${historyId}`
       : 'http://localhost:8000/api/v1/v2/highlights/batch'
@@ -213,10 +219,23 @@ function SubQuestionSections({
       body: JSON.stringify({ targets }),
     })
       .then((res) => res.json())
-      .then((data) => {
+      .then((data: { status: string; cached_count?: number; errors?: string[]; highlight_time_ms?: number }) => {
+        const elapsed = Math.round(performance.now() - startTime)
+        console.log(
+          `[Highlight] Response: status=${data.status} cached=${data.cached_count ?? '?'} ` +
+          `errors=${data.errors?.length ?? 0} backend_time=${data.highlight_time_ms ?? '?'}ms total=${elapsed}ms`,
+        )
+        if (data.errors?.length) {
+          console.warn('[Highlight] Errors:', data.errors)
+        }
+        if (data.status === 'failed') {
+          console.error('[Highlight] Batch computation failed:', data.errors)
+          setHighlightStatus('')
+          return
+        }
         if (data.status === 'completed' || data.status === 'partial') {
           const keys = new Set<string>()
-          targets.forEach((t) => {
+          targets.forEach((t: { document_id: string; chunk_index: number; sub_question_text: string }) => {
             keys.add(`${t.document_id}_${t.chunk_index}_${encodeURIComponent(t.sub_question_text)}`)
           })
           setHighlightReadyKeys(keys)
@@ -226,6 +245,7 @@ function SubQuestionSections({
       })
       .catch((err) => {
         console.error('Highlight batch computation failed:', err)
+        setHighlightStatus('')
       })
   }, [answer, isLoading, subQuestionSources, historyId])