def render_highlight_html(
    chunk_text: str,
    sentences: list[str],
    relevant_sentences: "list[RelevantSentence]",
    metadata: dict[str, Any],
) -> str:
    """Render one chunk as a self-contained HTML page with highlighted sentences.

    Every dynamic value (sentence text, reasons, filename, sub-question,
    page number, file path) is HTML-escaped before it is interpolated,
    because chunk text comes from ingested documents and reasons come
    from an LLM; neither may be trusted as markup.

    Args:
        chunk_text: Full chunk text. Only rendered (as one plain paragraph)
            when ``sentences`` is empty.
        sentences: The chunk split into sentences, in order.
        relevant_sentences: Objects exposing ``sentence_index`` and
            ``reason``; sentences at these indices are wrapped in a
            ``class="highlighted"`` span followed by the reason text.
            Indices outside ``sentences`` are ignored.
        metadata: Optional keys read: ``filename`` (default ``"Unknown"``),
            ``page_number``, ``chunk_file_path``, ``sub_question``,
            ``chunk_index`` (default ``0``).

    Returns:
        The HTML document as a single newline-joined string.
    """
    # Function-scope imports keep this block independent of the file's
    # top-level import list.
    from html import escape
    from urllib.parse import quote

    reason_by_index: dict[int, str] = {
        rs.sentence_index: rs.reason for rs in relevant_sentences
    }

    filename = escape(str(metadata.get("filename", "Unknown")))
    page_number = metadata.get("page_number")
    chunk_file_path = metadata.get("chunk_file_path")
    sub_question = metadata.get("sub_question", "")
    chunk_index = metadata.get("chunk_index", 0)

    parts: list[str] = [
        "<!DOCTYPE html>",
        '<html lang="en">',
        "<head>",
        '<meta charset="utf-8">',
        f"<title>{filename}</title>",
        "<style>",
        "body { font-family: sans-serif; line-height: 1.6; margin: 2rem; }",
        ".header { color: #374151; margin-bottom: 1rem; }",
        ".sub-question { font-weight: bold; margin-bottom: 1rem; }",
        ".highlighted { background-color: #fef08a; }",
        ".reason { color: #6b7280; font-size: 0.85em; font-style: italic; }",
        ".footer { margin-top: 2rem; }",
        "</style>",
        "</head>",
        "<body>",
    ]

    header_bits = [f"<strong>{filename}</strong>"]
    if page_number is not None:
        header_bits.append(f"Page {escape(str(page_number))}")
    header_bits.append(f"Chunk {escape(str(chunk_index))}")
    header_html = " &middot; ".join(header_bits)
    parts.append(f'<div class="header">{header_html}</div>')

    if sub_question:
        parts.append(
            f'<div class="sub-question">Sub-question: {escape(str(sub_question))}</div>'
        )

    if sentences:
        for index, sentence in enumerate(sentences):
            text = escape(sentence)
            if index in reason_by_index:
                reason = escape(str(reason_by_index[index]))
                parts.append(
                    f'<p><span class="highlighted">{text}</span><br>'
                    f'<span class="reason">{reason}</span></p>'
                )
            else:
                parts.append(f"<p>{text}</p>")
    elif chunk_text:
        # No sentence split available: fall back to the raw chunk, unhighlighted.
        parts.append(f"<p>{escape(chunk_text)}</p>")

    if chunk_file_path:
        # Percent-encode the path for the query string ("/" kept readable),
        # then escape the whole URL for use inside an HTML attribute.
        href = escape(
            "/api/v1/chunks/view?file=" + quote(str(chunk_file_path), safe="/")
        )
        parts.append(
            f'<div class="footer"><a href="{href}">View Original PDF</a></div>'
        )

    parts.append("</body>")
    parts.append("</html>")

    return "\n".join(parts)


class ChunkHighlightService:
    """Batch-compute sentence highlights for cited chunks and cache the HTML.

    For each target the chunk is fetched from ``rag_service.collection``,
    split into sentences, and sent to the LLM in one batched prompt; each
    returned ``ChunkHighlights`` is rendered to HTML and stored through
    ``highlight_cache.set_highlight``.
    """

    def __init__(self, rag_service, llm_client, highlight_cache, settings):
        self._rag = rag_service
        self._llm = llm_client
        self._cache = highlight_cache
        self._settings = settings

    async def compute_highlights_batch(
        self,
        targets: "list[ChunkHighlightTarget]",
    ) -> "HighlightBatchResponse":
        """Fetch, highlight and cache every target in a single LLM call.

        Args:
            targets: Chunks to highlight, each tied to one sub-question.

        Returns:
            A ``HighlightBatchResponse`` whose status is ``"completed"`` when
            nothing went wrong, ``"partial"`` when some fetches failed or the
            LLM omitted some chunks, and ``"failed"`` when the LLM call itself
            raised. ``errors`` carries one message per problem.
        """
        if not targets:
            return HighlightBatchResponse(status="completed", cached_count=0)

        errors: list[str] = []
        fetched: list[tuple[Any, list[str], dict[str, Any]]] = []

        for target in targets:
            chunk_id = f"{target.document_id}_{target.chunk_index}"
            try:
                result = self._rag.collection.get(
                    ids=[chunk_id],
                    include=["documents", "metadatas"],
                )
                documents = result.get("documents")
                if not documents:
                    logger.warning(
                        "No documents returned for chunk_id=%s, skipping.", chunk_id
                    )
                    continue
                metadatas = result.get("metadatas")
                # Copy so the store's own dict is never mutated; tolerate a
                # None metadata entry.
                metadata = dict(metadatas[0] or {}) if metadatas else {}
                metadata["sub_question"] = target.sub_question_text
                metadata["chunk_index"] = target.chunk_index
                sentences = split_sentences(documents[0])
                fetched.append((target, sentences, metadata))
            except Exception as exc:
                # Per-target isolation: one bad fetch must not sink the batch.
                logger.error("Failed to fetch chunk %s: %s", chunk_id, exc)
                errors.append(f"Failed to fetch chunk {chunk_id}: {exc}")

        if not fetched:
            return HighlightBatchResponse(
                status="completed" if not errors else "partial",
                cached_count=0,
                errors=errors,
            )

        prompt = self._build_prompt(fetched)

        try:
            llm_result = await self._llm.complete_structured(
                prompt, HighlightBatchResult, step_name="HighlightBatch"
            )
        except Exception as exc:
            logger.error("HighlightBatch LLM call failed: %s", exc)
            # Keep the fetch errors gathered above alongside the LLM failure.
            return HighlightBatchResponse(
                status="failed", cached_count=0, errors=[*errors, str(exc)]
            )

        cached_count = self._cache_results(fetched, llm_result)

        result_ids = {(r.document_id, r.chunk_index) for r in llm_result.results}
        fetched_ids = {(t.document_id, t.chunk_index) for t, _, _ in fetched}
        # Sorted so the error list is deterministic across runs.
        errors.extend(
            f"No highlight result for {doc_id}_{chunk_idx}"
            for doc_id, chunk_idx in sorted(fetched_ids - result_ids)
        )

        return HighlightBatchResponse(
            status="partial" if errors else "completed",
            cached_count=cached_count,
            errors=errors,
        )

    def _build_prompt(
        self,
        fetched: "list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]]",
    ) -> str:
        """Build the batched prompt, grouping chunks under their sub-question.

        Sub-questions appear in ascending ``sub_question_index`` order; each
        chunk lists its sentences prefixed with a 0-based ``[i]`` index.
        """
        by_sub_q = defaultdict(list)
        for target, sentences, _meta in fetched:
            by_sub_q[target.sub_question_index].append((target, sentences))

        lines: list[str] = [
            "For each sub-question below, identify which sentences in each cited chunk are relevant to answering that sub-question.",
            "Return a HighlightBatchResult with a results list containing one ChunkHighlights per (document_id, chunk_index) pair.",
            "Each ChunkHighlights should list the relevant sentence indices (0-based) with a brief reason (max 80 chars).",
            "",
        ]

        for sq_idx in sorted(by_sub_q):
            items = by_sub_q[sq_idx]
            # All items in a group share the index; take the text from the first.
            sub_q_text = items[0][0].sub_question_text
            lines.append(f"## Sub-question {sq_idx}: {sub_q_text}")
            lines.append("")
            for target, sentences in items:
                lines.append(
                    f"### Chunk: document_id={target.document_id}, chunk_index={target.chunk_index}"
                )
                lines.extend(f"[{i}] {s}" for i, s in enumerate(sentences))
                lines.append("")

        return "\n".join(lines)

    def _cache_results(
        self,
        fetched: "list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]]",
        llm_result: "HighlightBatchResult",
    ) -> int:
        """Render and cache each LLM result that matches a fetched chunk.

        Returns:
            The number of ``set_highlight`` calls made.
        """
        # NOTE(review): the key omits the sub-question, so two targets citing
        # the same chunk under different sub-questions collide here and only
        # the last one is cached. Disambiguating needs a sub-question field
        # on the LLM result model.
        lookup = {
            (t.document_id, t.chunk_index): (t, s, m) for t, s, m in fetched
        }

        cached_count = 0
        for chunk_hl in llm_result.results:
            key = (chunk_hl.document_id, chunk_hl.chunk_index)
            entry = lookup.get(key)
            if entry is None:
                logger.warning(
                    "LLM returned highlights for unrequested chunk %s_%s, ignoring.",
                    chunk_hl.document_id,
                    chunk_hl.chunk_index,
                )
                continue

            target, sentences, metadata = entry
            html = render_highlight_html(
                chunk_text=" ".join(sentences),
                sentences=sentences,
                relevant_sentences=chunk_hl.relevant_sentences,
                metadata=metadata,
            )

            cache_key = compute_cache_key(
                target.document_id,
                target.chunk_index,
                target.sub_question_text,
            )
            self._cache.set_highlight(
                cache_key=cache_key,
                document_id=target.document_id,
                chunk_index=target.chunk_index,
                sub_question=target.sub_question_text,
                relevant_sentences_json=json.dumps(
                    [rs.model_dump() for rs in chunk_hl.relevant_sentences],
                    default=str,
                ),
                html_content=html,
            )
            cached_count += 1

        return cached_count
# Coverage:
# - compute_highlights_batch: valid targets, empty targets, per-target error
#   isolation, ChromaDB miss, LLM failure, partial results
# - render_highlight_html: highlighted sentences, non-highlighted sentences,
#   reason text, footer link, no footer link, empty relevant_sentences,
#   valid HTML structure

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _make_target(doc_id="doc1", chunk_idx=0, sub_q="What is X?", sub_q_idx=0):
    """Build a ChunkHighlightTarget with test defaults."""
    return ChunkHighlightTarget(
        document_id=doc_id,
        chunk_index=chunk_idx,
        sub_question_text=sub_q,
        sub_question_index=sub_q_idx,
    )


def _make_metadata(
    filename="test.pdf",
    page_number=1,
    chunk_file_path="/chunks/page_1.txt",
    document_id="doc1",
    chunk_index=0,
    sub_question="What is X?",
):
    """Build a chunk metadata dict in the shape the service reads."""
    return {
        "filename": filename,
        "page_number": page_number,
        "chunk_file_path": chunk_file_path,
        "document_id": document_id,
        "chunk_index": chunk_index,
        "sub_question": sub_question,
    }


def _mock_collection(get_return=None):
    """Return a MagicMock collection whose .get() returns get_return."""
    if get_return is None:
        # Explicit None check so a deliberately empty dict is honoured.
        get_return = {
            "ids": ["doc1_0"],
            "documents": ["This is sentence one. This is sentence two. This is sentence three."],
            "metadatas": [_make_metadata()],
        }
    collection = MagicMock()
    collection.get.return_value = get_return
    return collection


def _echo_collection(text):
    """Return a collection whose .get() answers any "<doc>_<idx>" id with text."""

    def _get(ids, include):
        chunk_id = ids[0]
        doc_id, chunk_idx = chunk_id.split("_")
        return {
            "ids": [chunk_id],
            "documents": [text],
            "metadatas": [_make_metadata(document_id=doc_id, chunk_index=int(chunk_idx))],
        }

    collection = MagicMock()
    collection.get.side_effect = _get
    return collection


def _mock_llm(result: HighlightBatchResult):
    """Return an LLM client mock whose complete_structured resolves to result."""
    llm = MagicMock()
    llm.complete_structured = AsyncMock(return_value=result)
    return llm


def _mock_cache():
    """In-memory dict-backed cache mock."""
    store = {}

    cache = MagicMock()
    cache.get_highlight.side_effect = lambda key: store.get(key)
    cache.set_highlight.side_effect = lambda **kwargs: store.update(
        {kwargs["cache_key"]: kwargs["html_content"]}
    )
    # expose store for assertions
    cache._store = store
    return cache


def _mock_settings():
    """Return a settings mock with a chunk path set."""
    s = MagicMock()
    s.document_chunk_path = "/data/chunks"
    return s


def _service(collection=None, llm=None, cache=None, settings=None):
    """Assemble a ChunkHighlightService from mocks, defaulting each dependency."""
    rag = MagicMock()
    rag.collection = collection if collection is not None else _mock_collection()

    if llm is None:
        llm = _mock_llm(
            HighlightBatchResult(
                results=[
                    ChunkHighlights(
                        document_id="doc1",
                        chunk_index=0,
                        relevant_sentences=[
                            RelevantSentence(sentence_index=0, reason="Defines X"),
                            RelevantSentence(sentence_index=2, reason="Explains X"),
                        ],
                    )
                ]
            )
        )

    return ChunkHighlightService(
        rag_service=rag,
        llm_client=llm,
        highlight_cache=cache if cache is not None else _mock_cache(),
        settings=settings if settings is not None else _mock_settings(),
    )


# ===========================================================================
# compute_highlights_batch tests
# ===========================================================================


class TestComputeHighlightsBatch:
    """Tests for ChunkHighlightService.compute_highlights_batch."""

    @pytest.mark.asyncio
    async def test_valid_targets_completed(self):
        """LLM returns structured result for all targets → status completed."""
        svc = _service()
        resp = await svc.compute_highlights_batch([_make_target()])

        assert resp.status == "completed"
        assert resp.cached_count == 1
        assert resp.errors == []

    @pytest.mark.asyncio
    async def test_empty_targets(self):
        """Empty target list → completed with cached_count=0, no LLM call."""
        svc = _service()
        resp = await svc.compute_highlights_batch([])

        assert resp.status == "completed"
        assert resp.cached_count == 0
        svc._llm.complete_structured.assert_not_called()

    @pytest.mark.asyncio
    async def test_per_target_error_isolation(self):
        """One target's ChromaDB fetch fails → other 2 succeed, errors non-empty."""
        collection = MagicMock()

        def side_effect(ids, include):
            chunk_id = ids[0]
            if chunk_id == "doc1_0":
                return {
                    "ids": [chunk_id],
                    "documents": ["Sentence A. Sentence B."],
                    "metadatas": [_make_metadata(document_id="doc1", chunk_index=0)],
                }
            if chunk_id == "doc2_0":
                raise RuntimeError("ChromaDB boom")
            return {
                "ids": [chunk_id],
                "documents": ["Sentence C. Sentence D."],
                "metadatas": [_make_metadata(
                    document_id="doc3", chunk_index=0,
                    filename="c.pdf", sub_question="What is C?",
                )],
            }

        collection.get.side_effect = side_effect

        llm_result = HighlightBatchResult(
            results=[
                ChunkHighlights(
                    document_id="doc1",
                    chunk_index=0,
                    relevant_sentences=[RelevantSentence(sentence_index=0, reason="Relevant")],
                ),
                ChunkHighlights(
                    document_id="doc3",
                    chunk_index=0,
                    relevant_sentences=[RelevantSentence(sentence_index=1, reason="Relevant")],
                ),
            ]
        )
        svc = _service(collection=collection, llm=_mock_llm(llm_result), cache=_mock_cache())

        targets = [
            _make_target(doc_id="doc1", chunk_idx=0, sub_q="Q1", sub_q_idx=0),
            _make_target(doc_id="doc2", chunk_idx=0, sub_q="Q2", sub_q_idx=1),
            _make_target(doc_id="doc3", chunk_idx=0, sub_q="Q3", sub_q_idx=2),
        ]
        resp = await svc.compute_highlights_batch(targets)

        assert resp.cached_count == 2
        assert len(resp.errors) >= 1
        # The failing chunk must be the one named in the error.
        assert any("doc2_0" in e for e in resp.errors)
        assert resp.status == "partial"

    @pytest.mark.asyncio
    async def test_chromadb_miss_skips_target(self):
        """collection.get returns empty documents → target skipped."""
        collection = MagicMock()
        collection.get.return_value = {"ids": [], "documents": [], "metadatas": []}

        llm = _mock_llm(HighlightBatchResult(results=[]))
        svc = _service(collection=collection, llm=llm, cache=_mock_cache())

        resp = await svc.compute_highlights_batch([_make_target()])

        # No chunks fetched → nothing to send to LLM → completed with 0 cached
        assert resp.cached_count == 0
        assert resp.status == "completed"
        assert resp.errors == []
        llm.complete_structured.assert_not_called()

    @pytest.mark.asyncio
    async def test_llm_fails_entirely(self):
        """LLM call raises → status failed, cached_count=0."""
        llm = MagicMock()
        llm.complete_structured = AsyncMock(side_effect=Exception("LLM down"))

        svc = _service(llm=llm)
        resp = await svc.compute_highlights_batch([_make_target()])

        assert resp.status == "failed"
        assert resp.cached_count == 0
        assert any("LLM down" in e for e in resp.errors)

    @pytest.mark.asyncio
    async def test_llm_returns_results_for_each_target(self):
        """LLM returns ChunkHighlights matching each target."""
        llm_result = HighlightBatchResult(
            results=[
                ChunkHighlights(document_id="d1", chunk_index=0,
                                relevant_sentences=[RelevantSentence(sentence_index=0, reason="R1")]),
                ChunkHighlights(document_id="d2", chunk_index=0,
                                relevant_sentences=[RelevantSentence(sentence_index=1, reason="R2")]),
            ]
        )
        svc = _service(
            collection=_echo_collection("Alpha. Beta. Gamma."),
            llm=_mock_llm(llm_result),
            cache=_mock_cache(),
        )

        targets = [
            _make_target(doc_id="d1", chunk_idx=0, sub_q="SQ1", sub_q_idx=0),
            _make_target(doc_id="d2", chunk_idx=0, sub_q="SQ2", sub_q_idx=1),
        ]
        resp = await svc.compute_highlights_batch(targets)

        assert resp.cached_count == 2
        assert resp.status == "completed"

    @pytest.mark.asyncio
    async def test_llm_returns_fewer_results_than_targets_partial(self):
        """LLM returns fewer ChunkHighlights than targets → status partial."""
        # Only 1 result for 3 targets
        llm_result = HighlightBatchResult(
            results=[
                ChunkHighlights(document_id="d1", chunk_index=0,
                                relevant_sentences=[RelevantSentence(sentence_index=0, reason="R1")]),
            ]
        )
        svc = _service(
            collection=_echo_collection("Alpha. Beta."),
            llm=_mock_llm(llm_result),
            cache=_mock_cache(),
        )

        targets = [
            _make_target(doc_id="d1", chunk_idx=0, sub_q="SQ1", sub_q_idx=0),
            _make_target(doc_id="d2", chunk_idx=0, sub_q="SQ2", sub_q_idx=1),
            _make_target(doc_id="d3", chunk_idx=0, sub_q="SQ3", sub_q_idx=2),
        ]
        resp = await svc.compute_highlights_batch(targets)

        assert resp.status == "partial"
        assert resp.cached_count == 1
        # Each chunk the LLM skipped is reported by id.
        assert any("d2_0" in e for e in resp.errors)
        assert any("d3_0" in e for e in resp.errors)


# ===========================================================================
# render_highlight_html tests
# ===========================================================================


class TestRenderHighlightHtml:
    """Tests for the render_highlight_html standalone function."""

    def test_highlighted_sentences_have_yellow_background(self):
        """Highlighted sentences should have yellow background CSS."""
        html = render_highlight_html(
            chunk_text="Alpha. Beta. Gamma.",
            sentences=["Alpha.", "Beta.", "Gamma."],
            relevant_sentences=[RelevantSentence(sentence_index=1, reason="Key")],
            metadata=_make_metadata(),
        )
        assert "highlighted" in html
        assert "#fef08a" in html
        # "Beta." should be inside highlighted span
        assert "Beta." in html

    def test_non_highlighted_no_yellow(self):
        """Non-highlighted sentences should NOT have highlighted class."""
        html = render_highlight_html(
            chunk_text="Alpha. Beta. Gamma.",
            sentences=["Alpha.", "Beta.", "Gamma."],
            relevant_sentences=[RelevantSentence(sentence_index=1, reason="Key")],
            metadata=_make_metadata(),
        )
        assert "Alpha." in html
        assert "Gamma." in html
        # Exactly one sentence (Beta) carries the class, so Alpha and Gamma
        # cannot be highlighted.
        assert html.count('class="highlighted"') == 1

    def test_reason_text_shown_below_highlighted(self):
        """Reason text should appear below each highlighted sentence."""
        html = render_highlight_html(
            chunk_text="Alpha. Beta.",
            sentences=["Alpha.", "Beta."],
            relevant_sentences=[RelevantSentence(sentence_index=0, reason="Defines the concept")],
            metadata=_make_metadata(),
        )
        assert "reason" in html
        assert "Defines the concept" in html

    def test_footer_link_with_chunk_file_path(self):
        """When chunk_file_path is present, footer has View Original PDF link."""
        html = render_highlight_html(
            chunk_text="Some text.",
            sentences=["Some text."],
            relevant_sentences=[],
            metadata=_make_metadata(chunk_file_path="/chunks/page_1.txt"),
        )
        assert "View Original PDF" in html
        assert 'href="/api/v1/chunks/view?file=/chunks/page_1.txt"' in html

    def test_no_footer_link_when_chunk_file_path_none(self):
        """When chunk_file_path is None, no footer link rendered."""
        html = render_highlight_html(
            chunk_text="Some text.",
            sentences=["Some text."],
            relevant_sentences=[],
            metadata=_make_metadata(chunk_file_path=None),
        )
        assert "View Original PDF" not in html

    def test_empty_relevant_sentences_all_plain(self):
        """Empty relevant_sentences → all text plain, no yellow backgrounds."""
        html = render_highlight_html(
            chunk_text="Alpha. Beta.",
            sentences=["Alpha.", "Beta."],
            relevant_sentences=[],
            metadata=_make_metadata(),
        )
        assert 'class="highlighted"' not in html
        assert "Alpha." in html
        assert "Beta." in html

    def test_html_valid_self_contained(self):
        """HTML page is valid and self-contained (DOCTYPE, head, body)."""
        html = render_highlight_html(
            chunk_text="Some text.",
            sentences=["Some text."],
            relevant_sentences=[],
            metadata=_make_metadata(),
        )
        # Compare case-insensitively: the doctype keyword's casing is free.
        lowered = html.lower()
        assert "<!doctype html>" in lowered
        assert "<html" in lowered
        assert "<head>" in lowered
        assert "<body>" in lowered
        assert "</html>" in lowered
        assert "charset" in html