feat: add LLM-based batch highlight service and HTML rendering (Phase 5.4.4)
- ChunkHighlightService.compute_highlights_batch(): single LLM call across all cited chunks, grouped by sub-question, with structured output
- render_highlight_html(): self-contained HTML page with yellow-highlighted relevant sentences, LLM reason annotations, and a "View Original PDF" footer
- Per-target error isolation, ChromaDB miss handling, graceful degradation
- 14 tests: 7 batch service + 7 HTML rendering
This commit is contained in:
parent
bdbc8ea1a0
commit
c6d4a38013
|
|
@ -0,0 +1,231 @@
|
||||||
|
"""Chunk highlight service — batch LLM highlight computation and HTML rendering."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from app.models.highlight import (
|
||||||
|
ChunkHighlightTarget,
|
||||||
|
ChunkHighlights,
|
||||||
|
HighlightBatchResponse,
|
||||||
|
HighlightBatchResult,
|
||||||
|
RelevantSentence,
|
||||||
|
)
|
||||||
|
from app.services.highlight_cache import compute_cache_key
|
||||||
|
from app.utils.sentence_splitter import split_sentences
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def render_highlight_html(
    chunk_text: str,
    sentences: list[str],
    relevant_sentences: list[RelevantSentence],
    metadata: dict[str, Any],
) -> str:
    """Render one chunk as a self-contained HTML page with highlighted sentences.

    Every value that originates outside this function (document metadata,
    chunk sentences, LLM-written reasons) is HTML-escaped before being placed
    in the page, and the chunk file path is percent-encoded before it is used
    as the ``file`` query parameter of the footer link.

    Args:
        chunk_text: Full chunk text. Accepted for interface compatibility;
            the page is built from ``sentences`` only.
        sentences: The chunk split into sentences, in display order.
        relevant_sentences: Items exposing ``sentence_index`` and ``reason``.
            Sentences whose position matches a ``sentence_index`` are
            highlighted; indices that match no sentence are ignored. If an
            index appears more than once, the last reason wins.
        metadata: Optional keys read here: ``filename`` (default "Unknown"),
            ``page_number``, ``chunk_file_path``, ``sub_question`` and
            ``chunk_index`` (default 0).

    Returns:
        The complete HTML document as a single newline-joined string.
    """
    # Function-scope imports: the file's import block does not provide these.
    from html import escape
    from urllib.parse import quote

    def _text(value: object) -> str:
        # Escape for an HTML text node; quotes are harmless there.
        return escape(str(value), quote=False)

    index_to_reason: dict[int, str] = {
        rs.sentence_index: rs.reason for rs in relevant_sentences
    }

    filename = metadata.get("filename", "Unknown")
    page_number = metadata.get("page_number")
    chunk_file_path = metadata.get("chunk_file_path")
    sub_question = metadata.get("sub_question", "")
    chunk_index = metadata.get("chunk_index", 0)

    parts: list[str] = [
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        '<meta charset="utf-8">',
        "<style>",
        "body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; margin: 24px; color: #1e293b; line-height: 1.7; }",
        ".highlighted { background-color: #fef08a; padding: 2px 4px; border-radius: 3px; }",
        ".reason { color: #6b7280; font-style: italic; font-size: 0.9em; margin-left: 12px; }",
        ".header { margin-bottom: 16px; }",
        ".sub-header { color: #475569; font-size: 0.95em; margin: 4px 0; }",
        ".sentence { margin: 8px 0; }",
        ".footer { margin-top: 24px; padding-top: 12px; border-top: 1px solid #e2e8f0; }",
        ".footer a { color: #2563eb; text-decoration: none; }",
        ".footer a:hover { text-decoration: underline; }",
        "</style>",
        "</head>",
        "<body>",
    ]

    parts.append('<div class="header">')
    parts.append(f"<h2>{_text(filename)} — Chunk {_text(chunk_index)}</h2>")
    if page_number is not None:
        parts.append(f'<p class="sub-header">Page {_text(page_number)}</p>')
    if sub_question:
        parts.append(f'<p class="sub-header">Sub-question: {_text(sub_question)}</p>')
    parts.append("</div>")

    for i, sentence in enumerate(sentences):
        if i in index_to_reason:
            reason = index_to_reason[i]
            parts.append('<div class="sentence">')
            parts.append(f'<span class="highlighted">{_text(sentence)}</span>')
            if reason:
                parts.append(f'<span class="reason">{_text(reason)}</span>')
            parts.append("</div>")
        else:
            parts.append(f'<p class="sentence">{_text(sentence)}</p>')

    if chunk_file_path:
        # Percent-encode first so characters such as '&', '#', '"' or spaces
        # cannot break out of the query value; '/' is left intact. The extra
        # escape() guards the attribute context.
        href_value = escape(quote(str(chunk_file_path)), quote=True)
        parts.append('<div class="footer">')
        parts.append(
            f'<a href="/api/v1/chunks/view?file={href_value}">View Original PDF →</a>'
        )
        parts.append("</div>")

    parts.append("</body>")
    parts.append("</html>")

    return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class ChunkHighlightService:
    """Compute sentence-level highlights for cited chunks and cache the rendered HTML.

    Collaborators, as used by the methods below:
        * ``rag_service.collection.get(ids=..., include=...)`` returns a
          mapping with ``documents`` and ``metadatas`` lists.
        * ``llm_client.complete_structured(prompt, schema, step_name=...)`` is
          awaited and returns a ``HighlightBatchResult``.
        * ``highlight_cache.set_highlight(...)`` stores one rendered page.
        * ``settings`` is stored but not read by any method in this class.
    """

    def __init__(self, rag_service, llm_client, highlight_cache, settings):
        """Store the collaborators; nothing is contacted at construction time."""
        self._rag = rag_service
        self._llm = llm_client
        self._cache = highlight_cache
        self._settings = settings

    async def compute_highlights_batch(
        self,
        targets: list[ChunkHighlightTarget],
    ) -> HighlightBatchResponse:
        """Fetch every target chunk, ask the LLM once, and cache the results.

        Args:
            targets: Chunks to highlight, each tied to one sub-question.

        Returns:
            A ``HighlightBatchResponse`` whose status is:
              * ``"completed"`` when nothing went wrong (including an empty
                ``targets`` list, and chunks the store simply did not return,
                which are skipped with a warning only);
              * ``"partial"`` when a fetch raised, the LLM omitted a fetched
                chunk, or a result could not be rendered/cached;
              * ``"failed"`` when the LLM call itself raised.
        """
        if not targets:
            return HighlightBatchResponse(status="completed", cached_count=0)

        errors: list[str] = []
        fetched: list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]] = []

        for target in targets:
            chunk_id = f"{target.document_id}_{target.chunk_index}"
            # Broad catch on purpose: one bad chunk must not sink the batch.
            try:
                result = self._rag.collection.get(
                    ids=[chunk_id],
                    include=["documents", "metadatas"],
                )
                if not result.get("documents"):
                    logger.warning("No documents returned for chunk_id=%s, skipping.", chunk_id)
                    continue
                chunk_text = result["documents"][0]
                metadatas = result.get("metadatas")
                # Copy so the store's own dict is never mutated, and tolerate
                # a None metadata entry for records stored without metadata.
                metadata: dict[str, Any] = dict(metadatas[0] or {}) if metadatas else {}
                metadata["sub_question"] = target.sub_question_text
                metadata["chunk_index"] = target.chunk_index
                sentences = split_sentences(chunk_text)
                fetched.append((target, sentences, metadata))
            except Exception as exc:
                msg = f"Failed to fetch chunk {chunk_id}: {exc}"
                logger.exception("Failed to fetch chunk %s", chunk_id)
                errors.append(msg)

        if not fetched:
            return HighlightBatchResponse(
                status="completed" if not errors else "partial",
                cached_count=0,
                errors=errors,
            )

        prompt = self._build_prompt(fetched)

        try:
            llm_result: HighlightBatchResult = await self._llm.complete_structured(
                prompt, HighlightBatchResult, step_name="HighlightBatch"
            )
        except Exception as exc:
            logger.exception("HighlightBatch LLM call failed")
            return HighlightBatchResponse(
                status="failed", cached_count=0, errors=[str(exc)]
            )

        cached_count = self._cache_results(fetched, llm_result, errors)

        result_ids = {(r.document_id, r.chunk_index) for r in llm_result.results}
        fetched_ids = {(t.document_id, t.chunk_index) for t, _, _ in fetched}
        # Sorted so the error list is reproducible (set order is not).
        errors.extend(
            f"No highlight result for {doc_id}_{chunk_idx}"
            for doc_id, chunk_idx in sorted(fetched_ids - result_ids)
        )

        return HighlightBatchResponse(
            status="partial" if errors else "completed",
            cached_count=cached_count,
            errors=errors,
        )

    def _build_prompt(
        self,
        fetched: list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]],
    ) -> str:
        """Build the single batch prompt, grouping chunks under their sub-question.

        Sub-questions appear in ascending ``sub_question_index`` order; the
        heading text is taken from the first target seen for that index.
        Each sentence is listed as ``[i] sentence`` with a 0-based index.
        """
        by_sub_q: dict[int, list[tuple[ChunkHighlightTarget, list[str]]]] = defaultdict(list)
        for target, sentences, _meta in fetched:
            by_sub_q[target.sub_question_index].append((target, sentences))

        lines: list[str] = [
            "For each sub-question below, identify which sentences in each cited chunk are relevant to answering that sub-question.",
            "Return a HighlightBatchResult with a results list containing one ChunkHighlights per (document_id, chunk_index) pair.",
            "Each ChunkHighlights should list the relevant sentence indices (0-based) with a brief reason (max 80 chars).",
            "",
        ]

        for sq_idx in sorted(by_sub_q):
            items = by_sub_q[sq_idx]
            sub_q_text = items[0][0].sub_question_text
            lines.append(f"## Sub-question {sq_idx}: {sub_q_text}")
            lines.append("")
            for target, sentences in items:
                lines.append(f"### Chunk: document_id={target.document_id}, chunk_index={target.chunk_index}")
                lines.extend(f"[{i}] {s}" for i, s in enumerate(sentences))
                lines.append("")

        return "\n".join(lines)

    def _cache_results(
        self,
        fetched: list[tuple[ChunkHighlightTarget, list[str], dict[str, Any]]],
        llm_result: HighlightBatchResult,
        errors: "list[str] | None" = None,
    ) -> int:
        """Render and cache one HTML page per LLM result that matches a fetched chunk.

        Args:
            fetched: ``(target, sentences, metadata)`` triples that were sent
                to the LLM.
            llm_result: Structured LLM output; results whose
                ``(document_id, chunk_index)`` was not fetched are ignored.
            errors: Optional list that receives one message per chunk whose
                render or cache write raised. When omitted, such an exception
                propagates to the caller.

        Returns:
            The number of successful cache writes.
        """
        # NOTE(review): the lookup key is (document_id, chunk_index) only, so
        # two targets citing the same chunk under different sub-questions
        # collapse to the last one here — confirm whether that can occur.
        lookup: dict[tuple[str, int], tuple[ChunkHighlightTarget, list[str], dict[str, Any]]] = {
            (t.document_id, t.chunk_index): (t, s, m)
            for t, s, m in fetched
        }

        cached_count = 0
        for chunk_hl in llm_result.results:
            entry = lookup.get((chunk_hl.document_id, chunk_hl.chunk_index))
            if entry is None:
                continue

            target, sentences, metadata = entry
            try:
                html = render_highlight_html(
                    chunk_text=" ".join(sentences),
                    sentences=sentences,
                    relevant_sentences=chunk_hl.relevant_sentences,
                    metadata=metadata,
                )
                cache_key = compute_cache_key(
                    target.document_id,
                    target.chunk_index,
                    target.sub_question_text,
                )
                self._cache.set_highlight(
                    cache_key=cache_key,
                    document_id=target.document_id,
                    chunk_index=target.chunk_index,
                    sub_question=target.sub_question_text,
                    relevant_sentences_json=json.dumps(
                        [rs.model_dump() for rs in chunk_hl.relevant_sentences],
                        default=str,
                    ),
                    html_content=html,
                )
            except Exception as exc:
                if errors is None:
                    raise
                chunk_id = f"{target.document_id}_{target.chunk_index}"
                logger.exception("Failed to cache highlight for %s", chunk_id)
                errors.append(f"Failed to cache highlight for {chunk_id}: {exc}")
                continue
            cached_count += 1

        return cached_count
|
||||||
|
|
@ -0,0 +1,391 @@
|
||||||
|
"""Tests for ChunkHighlightService — compute_highlights_batch and render_highlight_html.
|
||||||
|
|
||||||
|
Coverage:
|
||||||
|
- compute_highlights_batch: valid targets, empty targets, per-target error isolation,
|
||||||
|
ChromaDB miss, LLM failure, partial results
|
||||||
|
- render_highlight_html: highlighted sentences, non-highlighted sentences, reason text,
|
||||||
|
footer link, no footer link, empty relevant_sentences, valid HTML structure
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.models.highlight import (
|
||||||
|
ChunkHighlightTarget,
|
||||||
|
ChunkHighlights,
|
||||||
|
HighlightBatchResult,
|
||||||
|
RelevantSentence,
|
||||||
|
)
|
||||||
|
from app.services.chunk_highlight_service import ChunkHighlightService, render_highlight_html
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _make_target(doc_id="doc1", chunk_idx=0, sub_q="What is X?", sub_q_idx=0):
    """Build a ChunkHighlightTarget from short keyword arguments with test defaults."""
    fields = {
        "document_id": doc_id,
        "chunk_index": chunk_idx,
        "sub_question_text": sub_q,
        "sub_question_index": sub_q_idx,
    }
    return ChunkHighlightTarget(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_metadata(
|
||||||
|
filename="test.pdf",
|
||||||
|
page_number=1,
|
||||||
|
chunk_file_path="/chunks/page_1.txt",
|
||||||
|
document_id="doc1",
|
||||||
|
chunk_index=0,
|
||||||
|
sub_question="What is X?",
|
||||||
|
):
|
||||||
|
return {
|
||||||
|
"filename": filename,
|
||||||
|
"page_number": page_number,
|
||||||
|
"chunk_file_path": chunk_file_path,
|
||||||
|
"document_id": document_id,
|
||||||
|
"chunk_index": chunk_index,
|
||||||
|
"sub_question": sub_question,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_collection(get_return=None):
|
||||||
|
"""Return a MagicMock collection whose .get() returns get_return."""
|
||||||
|
collection = MagicMock()
|
||||||
|
collection.get.return_value = get_return or {
|
||||||
|
"ids": ["doc1_0"],
|
||||||
|
"documents": ["This is sentence one. This is sentence two. This is sentence three."],
|
||||||
|
"metadatas": [_make_metadata()],
|
||||||
|
}
|
||||||
|
return collection
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_llm(result: HighlightBatchResult):
|
||||||
|
llm = MagicMock()
|
||||||
|
llm.complete_structured = AsyncMock(return_value=result)
|
||||||
|
return llm
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_cache():
|
||||||
|
"""In-memory dict-backed cache mock."""
|
||||||
|
store = {}
|
||||||
|
|
||||||
|
cache = MagicMock()
|
||||||
|
cache.get_highlight.side_effect = lambda key: store.get(key)
|
||||||
|
cache.set_highlight.side_effect = lambda **kwargs: store.update({kwargs["cache_key"]: kwargs["html_content"]})
|
||||||
|
# expose store for assertions
|
||||||
|
cache._store = store
|
||||||
|
return cache
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_settings():
|
||||||
|
s = MagicMock()
|
||||||
|
s.document_chunk_path = "/data/chunks"
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def _service(collection=None, llm=None, cache=None, settings=None):
    """Assemble a ChunkHighlightService from test doubles.

    Any collaborator left unset (or falsy) is replaced by its default double.
    The default LLM double answers with highlights for doc1 chunk 0,
    sentences 0 and 2.
    """
    rag = MagicMock()
    rag.collection = collection or _mock_collection()

    if not llm:
        default_highlights = ChunkHighlights(
            document_id="doc1",
            chunk_index=0,
            relevant_sentences=[
                RelevantSentence(sentence_index=0, reason="Defines X"),
                RelevantSentence(sentence_index=2, reason="Explains X"),
            ],
        )
        llm = _mock_llm(HighlightBatchResult(results=[default_highlights]))

    return ChunkHighlightService(
        rag_service=rag,
        llm_client=llm,
        highlight_cache=cache or _mock_cache(),
        settings=settings or _mock_settings(),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# compute_highlights_batch tests
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestComputeHighlightsBatch:
    """Tests for ChunkHighlightService.compute_highlights_batch."""

    @pytest.mark.asyncio
    async def test_valid_targets_completed(self):
        """LLM returns structured result for all targets → status completed."""
        svc = _service()
        targets = [_make_target()]
        resp = await svc.compute_highlights_batch(targets)

        assert resp.status == "completed"
        assert resp.cached_count == 1
        assert resp.errors == []
        # Exactly one LLM round-trip and one cache write for one target.
        svc._llm.complete_structured.assert_awaited_once()
        svc._cache.set_highlight.assert_called_once()

    @pytest.mark.asyncio
    async def test_empty_targets(self):
        """Empty target list → completed with cached_count=0, no LLM call."""
        svc = _service()
        resp = await svc.compute_highlights_batch([])

        assert resp.status == "completed"
        assert resp.cached_count == 0
        svc._llm.complete_structured.assert_not_called()
        svc._rag.collection.get.assert_not_called()

    @pytest.mark.asyncio
    async def test_per_target_error_isolation(self):
        """One target's ChromaDB fetch fails → other 2 succeed, errors non-empty."""
        collection = MagicMock()

        def side_effect(ids, include):
            chunk_id = ids[0]
            if chunk_id == "doc1_0":
                return {
                    "ids": [chunk_id],
                    "documents": ["Sentence A. Sentence B."],
                    "metadatas": [_make_metadata(document_id="doc1", chunk_index=0)],
                }
            elif chunk_id == "doc2_0":
                raise RuntimeError("ChromaDB boom")
            else:
                return {
                    "ids": [chunk_id],
                    "documents": ["Sentence C. Sentence D."],
                    "metadatas": [_make_metadata(
                        document_id="doc3", chunk_index=0,
                        filename="c.pdf", sub_question="What is C?",
                    )],
                }

        collection.get.side_effect = side_effect

        llm_result = HighlightBatchResult(
            results=[
                ChunkHighlights(
                    document_id="doc1",
                    chunk_index=0,
                    relevant_sentences=[RelevantSentence(sentence_index=0, reason="Relevant")],
                ),
                ChunkHighlights(
                    document_id="doc3",
                    chunk_index=0,
                    relevant_sentences=[RelevantSentence(sentence_index=1, reason="Relevant")],
                ),
            ]
        )
        cache = _mock_cache()
        svc = _service(collection=collection, llm=_mock_llm(llm_result), cache=cache)

        targets = [
            _make_target(doc_id="doc1", chunk_idx=0, sub_q="Q1", sub_q_idx=0),
            _make_target(doc_id="doc2", chunk_idx=0, sub_q="Q2", sub_q_idx=1),
            _make_target(doc_id="doc3", chunk_idx=0, sub_q="Q3", sub_q_idx=2),
        ]
        resp = await svc.compute_highlights_batch(targets)

        assert resp.cached_count == 2
        assert resp.status == "partial"
        # The single error must identify the failing chunk and its cause.
        assert len(resp.errors) == 1
        assert "doc2_0" in resp.errors[0]
        assert "ChromaDB boom" in resp.errors[0]
        assert cache.set_highlight.call_count == 2

    @pytest.mark.asyncio
    async def test_chromadb_miss_skips_target(self):
        """collection.get returns empty documents → target skipped."""
        collection = MagicMock()
        collection.get.return_value = {"ids": [], "documents": [], "metadatas": []}

        llm = _mock_llm(HighlightBatchResult(results=[]))
        cache = _mock_cache()
        svc = _service(collection=collection, llm=llm, cache=cache)

        targets = [_make_target()]
        resp = await svc.compute_highlights_batch(targets)

        # No chunks fetched → nothing to send to LLM → completed with 0 cached
        assert resp.status == "completed"
        assert resp.cached_count == 0
        assert resp.errors == []
        llm.complete_structured.assert_not_called()
        cache.set_highlight.assert_not_called()

    @pytest.mark.asyncio
    async def test_llm_fails_entirely(self):
        """LLM call raises → status failed, cached_count=0."""
        llm = MagicMock()
        llm.complete_structured = AsyncMock(side_effect=Exception("LLM down"))

        svc = _service(llm=llm)
        resp = await svc.compute_highlights_batch([_make_target()])

        assert resp.status == "failed"
        assert resp.cached_count == 0
        assert any("LLM down" in e for e in resp.errors)
        svc._cache.set_highlight.assert_not_called()

    @pytest.mark.asyncio
    async def test_llm_returns_results_for_each_target(self):
        """LLM returns ChunkHighlights matching each target."""
        collection = MagicMock()

        def side_effect(ids, include):
            did = ids[0]
            return {
                "ids": [did],
                "documents": ["Alpha. Beta. Gamma."],
                "metadatas": [_make_metadata(document_id=did.split("_")[0], chunk_index=int(did.split("_")[1]))],
            }

        collection.get.side_effect = side_effect

        llm_result = HighlightBatchResult(
            results=[
                ChunkHighlights(document_id="d1", chunk_index=0,
                                relevant_sentences=[RelevantSentence(sentence_index=0, reason="R1")]),
                ChunkHighlights(document_id="d2", chunk_index=0,
                                relevant_sentences=[RelevantSentence(sentence_index=1, reason="R2")]),
            ]
        )
        cache = _mock_cache()
        svc = _service(collection=collection, llm=_mock_llm(llm_result), cache=cache)

        targets = [
            _make_target(doc_id="d1", chunk_idx=0, sub_q="SQ1", sub_q_idx=0),
            _make_target(doc_id="d2", chunk_idx=0, sub_q="SQ2", sub_q_idx=1),
        ]
        resp = await svc.compute_highlights_batch(targets)

        assert resp.cached_count == 2
        assert resp.status == "completed"
        assert resp.errors == []
        # One batched LLM call regardless of the number of targets.
        svc._llm.complete_structured.assert_awaited_once()

    @pytest.mark.asyncio
    async def test_llm_returns_fewer_results_than_targets_partial(self):
        """LLM returns fewer ChunkHighlights than targets → status partial."""
        collection = MagicMock()

        def side_effect(ids, include):
            did = ids[0]
            return {
                "ids": [did],
                "documents": ["Alpha. Beta."],
                "metadatas": [_make_metadata(document_id=did.split("_")[0], chunk_index=int(did.split("_")[1]))],
            }

        collection.get.side_effect = side_effect

        # Only 1 result for 3 targets
        llm_result = HighlightBatchResult(
            results=[
                ChunkHighlights(document_id="d1", chunk_index=0,
                                relevant_sentences=[RelevantSentence(sentence_index=0, reason="R1")]),
            ]
        )
        cache = _mock_cache()
        svc = _service(collection=collection, llm=_mock_llm(llm_result), cache=cache)

        targets = [
            _make_target(doc_id="d1", chunk_idx=0, sub_q="SQ1", sub_q_idx=0),
            _make_target(doc_id="d2", chunk_idx=0, sub_q="SQ2", sub_q_idx=1),
            _make_target(doc_id="d3", chunk_idx=0, sub_q="SQ3", sub_q_idx=2),
        ]
        resp = await svc.compute_highlights_batch(targets)

        assert resp.status == "partial"
        assert resp.cached_count == 1
        # Each chunk the LLM skipped is reported; order is not significant.
        assert sorted(resp.errors) == [
            "No highlight result for d2_0",
            "No highlight result for d3_0",
        ]
|
||||||
|
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# render_highlight_html tests
|
||||||
|
# ===========================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestRenderHighlightHtml:
    """Tests for the render_highlight_html standalone function.

    Assertions match whole elements rather than bare words: the class names
    and the highlight colour always occur in the page's <style> block, so a
    substring check on them alone passes even when nothing is highlighted.
    """

    def test_highlighted_sentences_have_yellow_background(self):
        """Highlighted sentences should have yellow background CSS."""
        html = render_highlight_html(
            chunk_text="Alpha. Beta. Gamma.",
            sentences=["Alpha.", "Beta.", "Gamma."],
            relevant_sentences=[RelevantSentence(sentence_index=1, reason="Key")],
            metadata=_make_metadata(),
        )
        # The stylesheet defines the yellow background for .highlighted ...
        assert ".highlighted { background-color: #fef08a;" in html
        # ... and "Beta." is the sentence wrapped in that class.
        assert '<span class="highlighted">Beta.</span>' in html
        assert html.count('class="highlighted"') == 1

    def test_non_highlighted_no_yellow(self):
        """Non-highlighted sentences should NOT have highlighted class."""
        html = render_highlight_html(
            chunk_text="Alpha. Beta. Gamma.",
            sentences=["Alpha.", "Beta.", "Gamma."],
            relevant_sentences=[RelevantSentence(sentence_index=1, reason="Key")],
            metadata=_make_metadata(),
        )
        # Alpha. and Gamma. are rendered as plain paragraphs
        assert '<p class="sentence">Alpha.</p>' in html
        assert '<p class="sentence">Gamma.</p>' in html
        assert '<span class="highlighted">Alpha.</span>' not in html
        assert '<span class="highlighted">Gamma.</span>' not in html
        # Beta is highlighted and therefore not rendered as a plain paragraph
        assert '<span class="highlighted">Beta.</span>' in html
        assert '<p class="sentence">Beta.</p>' not in html

    def test_reason_text_shown_below_highlighted(self):
        """Reason text should appear below each highlighted sentence."""
        html = render_highlight_html(
            chunk_text="Alpha. Beta.",
            sentences=["Alpha.", "Beta."],
            relevant_sentences=[RelevantSentence(sentence_index=0, reason="Defines the concept")],
            metadata=_make_metadata(),
        )
        reason_el = '<span class="reason">Defines the concept</span>'
        assert reason_el in html
        # The reason follows the sentence it annotates.
        assert html.index('<span class="highlighted">Alpha.</span>') < html.index(reason_el)

    def test_footer_link_with_chunk_file_path(self):
        """When chunk_file_path is present, footer has View Original PDF link."""
        html = render_highlight_html(
            chunk_text="Some text.",
            sentences=["Some text."],
            relevant_sentences=[],
            metadata=_make_metadata(chunk_file_path="/chunks/page_1.txt"),
        )
        assert "View Original PDF" in html
        assert 'href="/api/v1/chunks/view?file=/chunks/page_1.txt"' in html
        assert '<div class="footer">' in html

    def test_no_footer_link_when_chunk_file_path_none(self):
        """When chunk_file_path is None, no footer link rendered."""
        html = render_highlight_html(
            chunk_text="Some text.",
            sentences=["Some text."],
            relevant_sentences=[],
            metadata=_make_metadata(chunk_file_path=None),
        )
        assert "View Original PDF" not in html
        assert '<div class="footer">' not in html

    def test_empty_relevant_sentences_all_plain(self):
        """Empty relevant_sentences → all text plain, no yellow backgrounds."""
        html = render_highlight_html(
            chunk_text="Alpha. Beta.",
            sentences=["Alpha.", "Beta."],
            relevant_sentences=[],
            metadata=_make_metadata(),
        )
        assert 'class="highlighted"' not in html
        assert 'class="reason"' not in html
        assert '<p class="sentence">Alpha.</p>' in html
        assert '<p class="sentence">Beta.</p>' in html

    def test_html_valid_self_contained(self):
        """HTML page is valid and self-contained (DOCTYPE, head, body)."""
        html = render_highlight_html(
            chunk_text="Some text.",
            sentences=["Some text."],
            relevant_sentences=[],
            metadata=_make_metadata(),
        )
        assert html.startswith("<!DOCTYPE html>")
        assert html.rstrip().endswith("</html>")
        assert '<meta charset="utf-8">' in html
        # Structural tags appear once each and in document order.
        positions = [html.index(tag) for tag in ("<head>", "</head>", "<body>", "</body>")]
        assert positions == sorted(positions)
        # Self-contained: styles are inline, nothing is fetched from elsewhere.
        assert "<style>" in html
        assert "<link" not in html
        assert "<script" not in html
|
||||||
Loading…
Reference in New Issue