feat: add LLM-based batch highlight service and HTML rendering (Phase 5.4.4)

- ChunkHighlightService.compute_highlights_batch(): single LLM call across all cited chunks, grouped by sub-question, with structured output
- render_highlight_html(): self-contained HTML page with yellow-highlighted relevant sentences, LLM reason annotations, and a "View Original PDF" footer
- Per-target error isolation, ChromaDB miss handling, graceful degradation
- 14 tests: 7 batch service + 7 HTML rendering
2026-04-29 09:26:33 +08:00 · 2026-04-29 09:26:33 +08:00 · c6d4a38013
parent bdbc8ea1a0
commit c6d4a38013
2 changed files with 622 additions and 0 deletions
--- a/backend/app/services/chunk_highlight_service.py
+++ b/backend/app/services/chunk_highlight_service.py
@ -0,0 +1,231 @@
+"""Chunk highlight service — batch LLM highlight computation and HTML rendering."""
+
+import json
+import logging
+from collections import defaultdict
+from typing import Any
+
+from app.models.highlight import (
+    ChunkHighlightTarget,
+    ChunkHighlights,
+    HighlightBatchResponse,
+    HighlightBatchResult,
+    RelevantSentence,
+)
+from app.services.highlight_cache import compute_cache_key
+from app.utils.sentence_splitter import split_sentences
+
+logger = logging.getLogger(__name__)
+
+
+def render_highlight_html(
+    chunk_text: str,
+    sentences: list[str],
+    relevant_sentences: list[RelevantSentence],
+    metadata: dict[str, Any],
+) -> str:
+    """Render one chunk as a self-contained HTML page with highlighted sentences.
+
+    Every sentence is emitted in order. Sentences whose index appears in
+    ``relevant_sentences`` are wrapped in a ``highlighted`` span, followed by
+    the reason text when it is non-empty; all other sentences are emitted as
+    plain paragraphs. Indices that do not match any sentence are ignored.
+
+    All dynamic values (metadata, sentence text, reasons) are HTML-escaped, and
+    the chunk file path is percent-encoded before being placed in the footer
+    link, because they originate from documents and LLM output and must not be
+    able to inject markup.
+
+    Args:
+        chunk_text: Full chunk text. Accepted for interface compatibility; the
+            page is built from ``sentences`` and this value is not read.
+        sentences: The chunk split into sentences, in display order.
+        relevant_sentences: Items exposing ``sentence_index`` and ``reason``.
+            If an index occurs more than once, the last reason wins.
+        metadata: Optional keys read here: ``filename`` (default ``"Unknown"``),
+            ``page_number``, ``chunk_file_path``, ``sub_question`` and
+            ``chunk_index`` (default ``0``).
+
+    Returns:
+        The HTML document as a single newline-joined string.
+    """
+    # Function-scope imports keep this function self-contained.
+    from html import escape
+    from urllib.parse import quote
+
+    # One mapping serves both as the "is highlighted" test and the reason lookup.
+    index_to_reason: dict[int, str] = {rs.sentence_index: rs.reason for rs in relevant_sentences}
+
+    filename = escape(str(metadata.get("filename", "Unknown")))
+    page_number = metadata.get("page_number")
+    chunk_file_path = metadata.get("chunk_file_path")
+    sub_question = metadata.get("sub_question", "")
+    chunk_index = escape(str(metadata.get("chunk_index", 0)))
+
+    parts: list[str] = [
+        "<!DOCTYPE html>",
+        "<html>",
+        "<head>",
+        '<meta charset="utf-8">',
+        "<style>",
+        "body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; margin: 24px; color: #1e293b; line-height: 1.7; }",
+        ".highlighted { background-color: #fef08a; padding: 2px 4px; border-radius: 3px; }",
+        ".reason { color: #6b7280; font-style: italic; font-size: 0.9em; margin-left: 12px; }",
+        ".header { margin-bottom: 16px; }",
+        ".sub-header { color: #475569; font-size: 0.95em; margin: 4px 0; }",
+        ".sentence { margin: 8px 0; }",
+        ".footer { margin-top: 24px; padding-top: 12px; border-top: 1px solid #e2e8f0; }",
+        ".footer a { color: #2563eb; text-decoration: none; }",
+        ".footer a:hover { text-decoration: underline; }",
+        "</style>",
+        "</head>",
+        "<body>",
+    ]
+
+    parts.append('<div class="header">')
+    parts.append(f"<h2>{filename} — Chunk {chunk_index}</h2>")
+    if page_number is not None:
+        parts.append(f'<p class="sub-header">Page {escape(str(page_number))}</p>')
+    if sub_question:
+        parts.append(f'<p class="sub-header">Sub-question: {escape(str(sub_question))}</p>')
+    parts.append("</div>")
+
+    for i, sentence in enumerate(sentences):
+        safe_sentence = escape(str(sentence))
+        if i in index_to_reason:
+            reason = index_to_reason[i]
+            parts.append('<div class="sentence">')
+            parts.append(f'<span class="highlighted">{safe_sentence}</span>')
+            if reason:
+                parts.append(f'<span class="reason">{escape(str(reason))}</span>')
+            parts.append("</div>")
+        else:
+            parts.append(f'<p class="sentence">{safe_sentence}</p>')
+
+    if chunk_file_path:
+        # Percent-encode so characters such as '&', '#', '"' or spaces cannot
+        # break out of the query value or the attribute; '/' is kept readable.
+        href_path = quote(str(chunk_file_path), safe="/")
+        parts.append('<div class="footer">')
+        parts.append(f'<a href="/api/v1/chunks/view?file={href_path}">View Original PDF →</a>')
+        parts.append("</div>")
+
+    parts.append("</body>")
+    parts.append("</html>")
+
+    return "\n".join(parts)
+
+
+class ChunkHighlightService:
+    """Compute sentence-level highlights for cited chunks in one batched LLM call.
+
+    For each target the chunk is fetched from ``rag_service.collection``, split
+    into sentences, and included in a single structured-output LLM prompt. Each
+    returned result is rendered to HTML and stored through ``highlight_cache``.
+    """
+
+    def __init__(self, rag_service, llm_client, highlight_cache, settings):
+        """Keep references to the collaborators; performs no I/O.
+
+        Args:
+            rag_service: Object exposing ``collection.get(ids=..., include=...)``.
+            llm_client: Object exposing an awaitable ``complete_structured``.
+            highlight_cache: Object exposing ``set_highlight(**kwargs)``.
+            settings: Stored as-is; not read by any method of this class.
+        """
+        self._rag = rag_service
+        self._llm = llm_client
+        self._cache = highlight_cache
+        self._settings = settings
+
+    async def compute_highlights_batch(
+        self,
+        targets: list[ChunkHighlightTarget],
+    ) -> HighlightBatchResponse:
+        """Fetch, highlight and cache all ``targets`` using a single LLM call.
+
+        Args:
+            targets: Chunks to highlight, each tied to one sub-question.
+
+        Returns:
+            A ``HighlightBatchResponse`` whose ``status`` is ``"completed"``
+            when nothing went wrong, ``"partial"`` when any fetch, cache write
+            or per-chunk LLM result is missing or failed, and ``"failed"`` when
+            the LLM call itself raised. Chunks for which the store returns no
+            documents are skipped with a warning and are not counted as errors.
+        """
+        if not targets:
+            return HighlightBatchResponse(status="completed", cached_count=0)
+
+        errors: list[str] = []
+        fetched: list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]] = []
+
+        for target in targets:
+            chunk_id = f"{target.document_id}_{target.chunk_index}"
+            try:
+                result = self._rag.collection.get(
+                    ids=[chunk_id],
+                    include=["documents", "metadatas"],
+                )
+                if not result.get("documents"):
+                    logger.warning("No documents returned for chunk_id=%s, skipping.", chunk_id)
+                    continue
+                chunk_text = result["documents"][0]
+                metadatas = result.get("metadatas")
+                # Copy so the dict owned by the store is never mutated, and
+                # tolerate a None metadata entry instead of failing the target.
+                metadata: dict[str, Any] = dict(metadatas[0] or {}) if metadatas else {}
+                metadata["sub_question"] = target.sub_question_text
+                metadata["chunk_index"] = target.chunk_index
+                sentences = split_sentences(chunk_text)
+                fetched.append((target, sentences, metadata))
+            except Exception as exc:
+                # Per-target isolation: one bad fetch must not sink the batch.
+                msg = f"Failed to fetch chunk {chunk_id}: {exc}"
+                logger.error(msg)
+                errors.append(msg)
+
+        if not fetched:
+            return HighlightBatchResponse(
+                status="completed" if not errors else "partial",
+                cached_count=0,
+                errors=errors,
+            )
+
+        prompt = self._build_prompt(fetched)
+
+        try:
+            llm_result: HighlightBatchResult = await self._llm.complete_structured(
+                prompt, HighlightBatchResult, step_name="HighlightBatch"
+            )
+        except Exception as exc:
+            logger.error("HighlightBatch LLM call failed: %s", exc)
+            # Keep the fetch errors gathered so far alongside the LLM failure.
+            return HighlightBatchResponse(
+                status="failed", cached_count=0, errors=[*errors, str(exc)]
+            )
+
+        cached_count = self._cache_results(fetched, llm_result, errors=errors)
+
+        result_ids = {(r.document_id, r.chunk_index) for r in llm_result.results}
+        fetched_ids = {(t.document_id, t.chunk_index) for t, _, _ in fetched}
+        # Sorted so the error list is deterministic rather than set-ordered.
+        for doc_id, chunk_idx in sorted(fetched_ids - result_ids):
+            errors.append(f"No highlight result for {doc_id}_{chunk_idx}")
+
+        status = "partial" if errors else "completed"
+        return HighlightBatchResponse(
+            status=status,
+            cached_count=cached_count,
+            errors=errors,
+        )
+
+    def _build_prompt(
+        self,
+        fetched: list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]],
+    ) -> str:
+        """Build the batch prompt, grouping chunks under their sub-question.
+
+        Groups are emitted in ascending ``sub_question_index`` order; the
+        heading text of a group is taken from its first target. Each chunk's
+        sentences are listed with their 0-based index so the LLM can refer to
+        them by number.
+        """
+        by_sub_q: dict[int, list[tuple[ChunkHighlightTarget, list[str]]]] = defaultdict(list)
+        for target, sentences, _meta in fetched:
+            by_sub_q[target.sub_question_index].append((target, sentences))
+
+        lines: list[str] = [
+            "For each sub-question below, identify which sentences in each cited chunk are relevant to answering that sub-question.",
+            "Return a HighlightBatchResult with a results list containing one ChunkHighlights per (document_id, chunk_index) pair.",
+            "Each ChunkHighlights should list the relevant sentence indices (0-based) with a brief reason (max 80 chars).",
+            "",
+        ]
+
+        for sq_idx in sorted(by_sub_q):
+            items = by_sub_q[sq_idx]
+            sub_q_text = items[0][0].sub_question_text
+            lines.append(f"## Sub-question {sq_idx}: {sub_q_text}")
+            lines.append("")
+            for target, sentences in items:
+                lines.append(f"### Chunk: document_id={target.document_id}, chunk_index={target.chunk_index}")
+                lines.extend(f"[{i}] {s}" for i, s in enumerate(sentences))
+                lines.append("")
+
+        return "\n".join(lines)
+
+    def _cache_results(
+        self,
+        fetched: list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]],
+        llm_result: HighlightBatchResult,
+        errors: "list[str] | None" = None,
+    ) -> int:
+        """Render and cache every LLM result that matches a fetched chunk.
+
+        Args:
+            fetched: ``(target, sentences, metadata)`` triples that were sent
+                to the LLM.
+            llm_result: Structured LLM output; results whose
+                ``(document_id, chunk_index)`` was not fetched are ignored.
+            errors: Optional list to which per-chunk render/cache failures are
+                appended. When omitted, such failures propagate to the caller.
+
+        Returns:
+            The number of successful ``set_highlight`` calls.
+        """
+        # NOTE(review): keyed by (document_id, chunk_index) only, so when the
+        # same chunk is cited by two sub-questions the later target overwrites
+        # the earlier one here. Fixing that needs a sub-question identifier on
+        # the LLM result — confirm against the ChunkHighlights model.
+        lookup: dict[tuple[str, int], tuple[ChunkHighlightTarget, list[str], dict[str, Any]]] = {
+            (t.document_id, t.chunk_index): (t, s, m)
+            for t, s, m in fetched
+        }
+
+        cached_count = 0
+        for chunk_hl in llm_result.results:
+            entry = lookup.get((chunk_hl.document_id, chunk_hl.chunk_index))
+            if entry is None:
+                continue
+
+            target, sentences, metadata = entry
+            try:
+                page_html = render_highlight_html(
+                    chunk_text=" ".join(sentences),
+                    sentences=sentences,
+                    relevant_sentences=chunk_hl.relevant_sentences,
+                    metadata=metadata,
+                )
+                cache_key = compute_cache_key(
+                    target.document_id,
+                    target.chunk_index,
+                    target.sub_question_text,
+                )
+                self._cache.set_highlight(
+                    cache_key=cache_key,
+                    document_id=target.document_id,
+                    chunk_index=target.chunk_index,
+                    sub_question=target.sub_question_text,
+                    relevant_sentences_json=json.dumps(
+                        [rs.model_dump() for rs in chunk_hl.relevant_sentences],
+                        default=str,
+                    ),
+                    html_content=page_html,
+                )
+            except Exception as exc:
+                if errors is None:
+                    raise
+                # Per-target isolation: a failed cache write for one chunk must
+                # not discard the highlights already computed for the others.
+                msg = f"Failed to cache highlight for {target.document_id}_{target.chunk_index}: {exc}"
+                logger.error(msg)
+                errors.append(msg)
+                continue
+            cached_count += 1
+
+        return cached_count
--- a/backend/app/test/test_phase5_chunk_highlight_service.py
+++ b/backend/app/test/test_phase5_chunk_highlight_service.py
@ -0,0 +1,391 @@
+"""Tests for ChunkHighlightService — compute_highlights_batch and render_highlight_html.
+
+Coverage:
+- compute_highlights_batch: valid targets, empty targets, per-target error isolation,
+  ChromaDB miss, LLM failure, partial results
+- render_highlight_html: highlighted sentences, non-highlighted sentences, reason text,
+  footer link, no footer link, empty relevant_sentences, valid HTML structure
+"""
+import json
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from app.models.highlight import (
+    ChunkHighlightTarget,
+    ChunkHighlights,
+    HighlightBatchResult,
+    RelevantSentence,
+)
+from app.services.chunk_highlight_service import ChunkHighlightService, render_highlight_html
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_target(doc_id="doc1", chunk_idx=0, sub_q="What is X?", sub_q_idx=0):
+    """Build a ``ChunkHighlightTarget`` for tests, with overridable defaults."""
+    fields = {
+        "document_id": doc_id,
+        "chunk_index": chunk_idx,
+        "sub_question_text": sub_q,
+        "sub_question_index": sub_q_idx,
+    }
+    return ChunkHighlightTarget(**fields)
+
+
+def _make_metadata(
+    filename="test.pdf",
+    page_number=1,
+    chunk_file_path="/chunks/page_1.txt",
+    document_id="doc1",
+    chunk_index=0,
+    sub_question="What is X?",
+):
+    """Return a chunk-metadata dict for tests; every field can be overridden."""
+    return dict(
+        filename=filename,
+        page_number=page_number,
+        chunk_file_path=chunk_file_path,
+        document_id=document_id,
+        chunk_index=chunk_index,
+        sub_question=sub_question,
+    )
+
+
+def _mock_collection(get_return=None):
+    """Create a mock collection whose ``get`` yields ``get_return``.
+
+    A falsy ``get_return`` is replaced by a single three-sentence document
+    stored under id ``doc1_0`` with default metadata.
+    """
+    if not get_return:
+        get_return = {
+            "ids": ["doc1_0"],
+            "documents": ["This is sentence one. This is sentence two. This is sentence three."],
+            "metadatas": [_make_metadata()],
+        }
+    mock_collection = MagicMock()
+    mock_collection.get.return_value = get_return
+    return mock_collection
+
+
+def _mock_llm(result: HighlightBatchResult):
+    """Create a mock LLM client whose awaited ``complete_structured`` returns ``result``."""
+    return MagicMock(complete_structured=AsyncMock(return_value=result))
+
+
+def _mock_cache():
+    """Create a mock highlight cache backed by a plain in-memory dict.
+
+    ``set_highlight`` stores ``html_content`` under ``cache_key`` and
+    ``get_highlight`` reads it back; the dict is exposed as ``_store``.
+    """
+    backing: dict = {}
+
+    def _read(key):
+        return backing.get(key)
+
+    def _write(**kwargs):
+        backing[kwargs["cache_key"]] = kwargs["html_content"]
+
+    mock_cache = MagicMock()
+    mock_cache.get_highlight.side_effect = _read
+    mock_cache.set_highlight.side_effect = _write
+    # Exposed so tests can assert on what was stored.
+    mock_cache._store = backing
+    return mock_cache
+
+
+def _mock_settings():
+    """Create a mock settings object with ``document_chunk_path`` preset."""
+    return MagicMock(document_chunk_path="/data/chunks")
+
+
+def _service(collection=None, llm=None, cache=None, settings=None):
+    """Assemble a ``ChunkHighlightService`` from mocks.
+
+    Any collaborator left falsy is replaced by its default mock; the default
+    LLM returns one ``ChunkHighlights`` for ``doc1``/chunk 0 marking sentences
+    0 and 2 as relevant.
+    """
+    rag = MagicMock()
+    rag.collection = collection if collection else _mock_collection()
+
+    if not llm:
+        default_highlights = ChunkHighlights(
+            document_id="doc1",
+            chunk_index=0,
+            relevant_sentences=[
+                RelevantSentence(sentence_index=0, reason="Defines X"),
+                RelevantSentence(sentence_index=2, reason="Explains X"),
+            ],
+        )
+        llm = _mock_llm(HighlightBatchResult(results=[default_highlights]))
+    if not cache:
+        cache = _mock_cache()
+    if not settings:
+        settings = _mock_settings()
+
+    return ChunkHighlightService(
+        rag_service=rag,
+        llm_client=llm,
+        highlight_cache=cache,
+        settings=settings,
+    )
+
+
+# ===========================================================================
+# compute_highlights_batch tests
+# ===========================================================================
+
+
+class TestComputeHighlightsBatch:
+    """Tests for ChunkHighlightService.compute_highlights_batch."""
+
+    @pytest.mark.asyncio
+    async def test_valid_targets_completed(self):
+        """LLM returns structured result for all targets → status completed."""
+        svc = _service()
+        targets = [_make_target()]
+        resp = await svc.compute_highlights_batch(targets)
+
+        assert resp.status == "completed"
+        assert resp.cached_count == 1
+        assert resp.errors == []
+
+        # Exactly one cache write, carrying the target's identity and the
+        # sentence indices the LLM reported.
+        svc._cache.set_highlight.assert_called_once()
+        kwargs = svc._cache.set_highlight.call_args.kwargs
+        assert kwargs["document_id"] == "doc1"
+        assert kwargs["chunk_index"] == 0
+        assert kwargs["sub_question"] == "What is X?"
+        stored = json.loads(kwargs["relevant_sentences_json"])
+        assert [item["sentence_index"] for item in stored] == [0, 2]
+        assert "<!DOCTYPE html>" in kwargs["html_content"]
+
+    @pytest.mark.asyncio
+    async def test_empty_targets(self):
+        """Empty target list → completed with cached_count=0, no LLM call."""
+        svc = _service()
+        resp = await svc.compute_highlights_batch([])
+
+        assert resp.status == "completed"
+        assert resp.cached_count == 0
+        svc._llm.complete_structured.assert_not_called()
+        svc._rag.collection.get.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_per_target_error_isolation(self):
+        """One target's ChromaDB fetch fails → other 2 succeed, errors non-empty."""
+        collection = MagicMock()
+
+        def side_effect(ids, include):
+            chunk_id = ids[0]
+            if chunk_id == "doc1_0":
+                return {
+                    "ids": [chunk_id],
+                    "documents": ["Sentence A. Sentence B."],
+                    "metadatas": [_make_metadata(document_id="doc1", chunk_index=0)],
+                }
+            elif chunk_id == "doc2_0":
+                raise RuntimeError("ChromaDB boom")
+            else:
+                return {
+                    "ids": [chunk_id],
+                    "documents": ["Sentence C. Sentence D."],
+                    "metadatas": [_make_metadata(
+                        document_id="doc3", chunk_index=0,
+                        filename="c.pdf", sub_question="What is C?",
+                    )],
+                }
+
+        collection.get.side_effect = side_effect
+
+        llm_result = HighlightBatchResult(
+            results=[
+                ChunkHighlights(
+                    document_id="doc1",
+                    chunk_index=0,
+                    relevant_sentences=[RelevantSentence(sentence_index=0, reason="Relevant")],
+                ),
+                ChunkHighlights(
+                    document_id="doc3",
+                    chunk_index=0,
+                    relevant_sentences=[RelevantSentence(sentence_index=1, reason="Relevant")],
+                ),
+            ]
+        )
+        cache = _mock_cache()
+        svc = _service(collection=collection, llm=_mock_llm(llm_result), cache=cache)
+
+        targets = [
+            _make_target(doc_id="doc1", chunk_idx=0, sub_q="Q1", sub_q_idx=0),
+            _make_target(doc_id="doc2", chunk_idx=0, sub_q="Q2", sub_q_idx=1),
+            _make_target(doc_id="doc3", chunk_idx=0, sub_q="Q3", sub_q_idx=2),
+        ]
+        resp = await svc.compute_highlights_batch(targets)
+
+        assert resp.status == "partial"
+        assert resp.cached_count == 2
+        # Only the failing fetch is reported, and it names the failing chunk.
+        assert len(resp.errors) == 1
+        assert "doc2_0" in resp.errors[0]
+        assert "ChromaDB boom" in resp.errors[0]
+        assert cache.set_highlight.call_count == 2
+        cached_docs = {call.kwargs["document_id"] for call in cache.set_highlight.call_args_list}
+        assert cached_docs == {"doc1", "doc3"}
+
+    @pytest.mark.asyncio
+    async def test_chromadb_miss_skips_target(self):
+        """collection.get returns empty documents → target skipped."""
+        collection = MagicMock()
+        collection.get.return_value = {"ids": [], "documents": [], "metadatas": []}
+
+        llm = _mock_llm(HighlightBatchResult(results=[]))
+        cache = _mock_cache()
+        svc = _service(collection=collection, llm=llm, cache=cache)
+
+        targets = [_make_target()]
+        resp = await svc.compute_highlights_batch(targets)
+
+        # No chunks fetched → nothing to send to LLM → completed with 0 cached
+        assert resp.status == "completed"
+        assert resp.cached_count == 0
+        assert resp.errors == []
+        llm.complete_structured.assert_not_called()
+        cache.set_highlight.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_llm_fails_entirely(self):
+        """LLM call raises → status failed, cached_count=0."""
+        llm = MagicMock()
+        llm.complete_structured = AsyncMock(side_effect=Exception("LLM down"))
+
+        svc = _service(llm=llm)
+        resp = await svc.compute_highlights_batch([_make_target()])
+
+        assert resp.status == "failed"
+        assert resp.cached_count == 0
+        assert any("LLM down" in e for e in resp.errors)
+        svc._cache.set_highlight.assert_not_called()
+
+    @pytest.mark.asyncio
+    async def test_llm_returns_results_for_each_target(self):
+        """LLM returns ChunkHighlights matching each target."""
+        collection = MagicMock()
+
+        def side_effect(ids, include):
+            did = ids[0]
+            return {
+                "ids": [did],
+                "documents": ["Alpha. Beta. Gamma."],
+                "metadatas": [_make_metadata(document_id=did.split("_")[0], chunk_index=int(did.split("_")[1]))],
+            }
+
+        collection.get.side_effect = side_effect
+
+        llm_result = HighlightBatchResult(
+            results=[
+                ChunkHighlights(document_id="d1", chunk_index=0,
+                                relevant_sentences=[RelevantSentence(sentence_index=0, reason="R1")]),
+                ChunkHighlights(document_id="d2", chunk_index=0,
+                                relevant_sentences=[RelevantSentence(sentence_index=1, reason="R2")]),
+            ]
+        )
+        cache = _mock_cache()
+        llm = _mock_llm(llm_result)
+        svc = _service(collection=collection, llm=llm, cache=cache)
+
+        targets = [
+            _make_target(doc_id="d1", chunk_idx=0, sub_q="SQ1", sub_q_idx=0),
+            _make_target(doc_id="d2", chunk_idx=0, sub_q="SQ2", sub_q_idx=1),
+        ]
+        resp = await svc.compute_highlights_batch(targets)
+
+        assert resp.cached_count == 2
+        assert resp.status == "completed"
+        assert resp.errors == []
+        # The whole batch must go through a single LLM call.
+        assert llm.complete_structured.await_count == 1
+        cached_sub_questions = {
+            call.kwargs["sub_question"] for call in cache.set_highlight.call_args_list
+        }
+        assert cached_sub_questions == {"SQ1", "SQ2"}
+
+    @pytest.mark.asyncio
+    async def test_llm_returns_fewer_results_than_targets_partial(self):
+        """LLM returns fewer ChunkHighlights than targets → status partial."""
+        collection = MagicMock()
+
+        def side_effect(ids, include):
+            did = ids[0]
+            return {
+                "ids": [did],
+                "documents": ["Alpha. Beta."],
+                "metadatas": [_make_metadata(document_id=did.split("_")[0], chunk_index=int(did.split("_")[1]))],
+            }
+
+        collection.get.side_effect = side_effect
+
+        # Only 1 result for 3 targets
+        llm_result = HighlightBatchResult(
+            results=[
+                ChunkHighlights(document_id="d1", chunk_index=0,
+                                relevant_sentences=[RelevantSentence(sentence_index=0, reason="R1")]),
+            ]
+        )
+        cache = _mock_cache()
+        svc = _service(collection=collection, llm=_mock_llm(llm_result), cache=cache)
+
+        targets = [
+            _make_target(doc_id="d1", chunk_idx=0, sub_q="SQ1", sub_q_idx=0),
+            _make_target(doc_id="d2", chunk_idx=0, sub_q="SQ2", sub_q_idx=1),
+            _make_target(doc_id="d3", chunk_idx=0, sub_q="SQ3", sub_q_idx=2),
+        ]
+        resp = await svc.compute_highlights_batch(targets)
+
+        assert resp.status == "partial"
+        assert resp.cached_count == 1
+        # Each chunk the LLM skipped is reported by id (order not relied upon).
+        assert set(resp.errors) == {
+            "No highlight result for d2_0",
+            "No highlight result for d3_0",
+        }
+
+
+# ===========================================================================
+# render_highlight_html tests
+# ===========================================================================
+
+
+class TestRenderHighlightHtml:
+    """Tests for the render_highlight_html standalone function.
+
+    Assertions target the emitted markup (``class="..."`` attributes and full
+    elements) rather than bare words such as "highlighted" or "reason", which
+    always occur in the page's inline CSS and would pass vacuously.
+    """
+
+    def test_highlighted_sentences_have_yellow_background(self):
+        """Highlighted sentences should have yellow background CSS."""
+        html = render_highlight_html(
+            chunk_text="Alpha. Beta. Gamma.",
+            sentences=["Alpha.", "Beta.", "Gamma."],
+            relevant_sentences=[RelevantSentence(sentence_index=1, reason="Key")],
+            metadata=_make_metadata(),
+        )
+        # The yellow colour is defined by the .highlighted CSS rule ...
+        assert ".highlighted { background-color: #fef08a;" in html
+        # ... and "Beta." is the sentence wrapped in that class.
+        assert '<span class="highlighted">Beta.</span>' in html
+
+    def test_non_highlighted_no_yellow(self):
+        """Non-highlighted sentences should NOT have highlighted class."""
+        html = render_highlight_html(
+            chunk_text="Alpha. Beta. Gamma.",
+            sentences=["Alpha.", "Beta.", "Gamma."],
+            relevant_sentences=[RelevantSentence(sentence_index=1, reason="Key")],
+            metadata=_make_metadata(),
+        )
+        # Alpha. and Gamma. are rendered as plain paragraphs.
+        assert '<p class="sentence">Alpha.</p>' in html
+        assert '<p class="sentence">Gamma.</p>' in html
+        assert '<span class="highlighted">Alpha.</span>' not in html
+        assert '<span class="highlighted">Gamma.</span>' not in html
+        # Only Beta. is highlighted.
+        assert html.count('<span class="highlighted">') == 1
+        assert '<span class="highlighted">Beta.</span>' in html
+
+    def test_reason_text_shown_below_highlighted(self):
+        """Reason text should appear below each highlighted sentence."""
+        html = render_highlight_html(
+            chunk_text="Alpha. Beta.",
+            sentences=["Alpha.", "Beta."],
+            relevant_sentences=[RelevantSentence(sentence_index=0, reason="Defines the concept")],
+            metadata=_make_metadata(),
+        )
+        highlighted = '<span class="highlighted">Alpha.</span>'
+        reason = '<span class="reason">Defines the concept</span>'
+        assert highlighted in html
+        assert reason in html
+        # The reason follows the sentence it annotates.
+        assert html.index(highlighted) < html.index(reason)
+        assert html.count('<span class="reason">') == 1
+
+    def test_footer_link_with_chunk_file_path(self):
+        """When chunk_file_path is present, footer has View Original PDF link."""
+        html = render_highlight_html(
+            chunk_text="Some text.",
+            sentences=["Some text."],
+            relevant_sentences=[],
+            metadata=_make_metadata(chunk_file_path="/chunks/page_1.txt"),
+        )
+        assert '<div class="footer">' in html
+        assert "View Original PDF" in html
+        assert 'href="/api/v1/chunks/view?file=/chunks/page_1.txt"' in html
+
+    def test_no_footer_link_when_chunk_file_path_none(self):
+        """When chunk_file_path is None, no footer link rendered."""
+        html = render_highlight_html(
+            chunk_text="Some text.",
+            sentences=["Some text."],
+            relevant_sentences=[],
+            metadata=_make_metadata(chunk_file_path=None),
+        )
+        assert "View Original PDF" not in html
+        assert '<div class="footer">' not in html
+        assert "<a href=" not in html
+
+    def test_empty_relevant_sentences_all_plain(self):
+        """Empty relevant_sentences → all text plain, no yellow backgrounds."""
+        html = render_highlight_html(
+            chunk_text="Alpha. Beta.",
+            sentences=["Alpha.", "Beta."],
+            relevant_sentences=[],
+            metadata=_make_metadata(),
+        )
+        assert 'class="highlighted"' not in html
+        assert 'class="reason"' not in html
+        assert '<p class="sentence">Alpha.</p>' in html
+        assert '<p class="sentence">Beta.</p>' in html
+
+    def test_html_valid_self_contained(self):
+        """HTML page is valid and self-contained (DOCTYPE, head, body)."""
+        html = render_highlight_html(
+            chunk_text="Some text.",
+            sentences=["Some text."],
+            relevant_sentences=[],
+            metadata=_make_metadata(),
+        )
+        assert html.startswith("<!DOCTYPE html>")
+        assert html.rstrip().endswith("</html>")
+        assert "<head>" in html
+        assert "</head>" in html
+        assert "<body>" in html
+        assert "</body>" in html
+        assert '<meta charset="utf-8">' in html
+        # Document structure is in the right order.
+        assert html.index("<head>") < html.index("</head>") < html.index("<body>") < html.index("</body>")
+        # Header reflects the metadata.
+        assert "<h2>test.pdf — Chunk 0</h2>" in html
+        assert '<p class="sub-header">Page 1</p>' in html
+        assert '<p class="sub-header">Sub-question: What is X?</p>' in html