feat: add SQLite highlight cache service (Phase 5.4.3)

- highlight_cache.py: HighlightCache class with get/set_highlight and
  compute_cache_key (sha256 hash of document_id|chunk_index|sub_question)
- INSERT OR REPLACE semantics, idempotent table creation
- 13 tests covering round-trip, overwrite, missing keys, determinism
This commit is contained in:
Woody 2026-04-29 09:26:20 +08:00
parent b11d31e2d1
commit bdbc8ea1a0
2 changed files with 316 additions and 0 deletions

View File

@ -0,0 +1,94 @@
"""Highlight result cache.
Stores pre-computed highlighted chunk HTML pages in SQLite for instant retrieval.
Uses sync sqlite3 — all operations are instant local reads/writes.
Each method opens its own connection.
"""
import hashlib
import json
import logging
import sqlite3
from contextlib import closing
logger = logging.getLogger(__name__)
def _connect(db_path: str) -> sqlite3.Connection:
conn = sqlite3.connect(db_path)
conn.row_factory = sqlite3.Row
return conn
def compute_cache_key(document_id: str, chunk_index: int, sub_question: str) -> str:
    """Return a deterministic SHA-256 hex digest identifying one highlight request.

    The digest is taken over the UTF-8 bytes of
    ``"<document_id>|<chunk_index>|<sub_question>"``. A backslash or ``|``
    occurring inside ``document_id`` is backslash-escaped first so the field
    boundaries stay unambiguous; without that, distinct triples such as
    ``("a|1", 2, "x")`` and ``("a", 1, "2|x")`` serialise to the same string
    and would share one cache entry. Document ids that contain neither
    character hash exactly as they did without the escaping.

    Args:
        document_id: Identifier of the source document.
        chunk_index: Index of the chunk within the document.
        sub_question: Sub-question the highlight was computed for.

    Returns:
        A 64-character lowercase hexadecimal string.
    """
    # Escape the escape character first, then the delimiter. ``sub_question``
    # is the last field and an integer ``chunk_index`` never renders a ``|``,
    # so only ``document_id`` can blur a boundary.
    safe_document_id = document_id.replace("\\", "\\\\").replace("|", "\\|")
    raw = f"{safe_document_id}|{chunk_index}|{sub_question}"
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()
class HighlightCache:
    """SQLite-backed cache of highlighted chunk HTML, looked up by ``cache_key``.

    Each operation opens its own short-lived connection through ``_connect``
    and closes it before returning. A ``sqlite3`` connection used as a context
    manager only commits or rolls back the transaction and does not close the
    connection, so every connection is additionally wrapped in
    ``contextlib.closing``.
    """

    def __init__(self, db_path: str) -> None:
        """Remember *db_path* and ensure the ``chunk_highlights`` table exists.

        Args:
            db_path: Database location handed to ``_connect`` on every call.
        """
        self._db_path = db_path
        self._init_table()

    def _init_table(self) -> None:
        """Create the ``chunk_highlights`` table and its index if missing (idempotent)."""
        # ``closing`` releases the connection; the inner ``conn`` context
        # commits on success and rolls back on error.
        with closing(_connect(self._db_path)) as conn, conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS chunk_highlights (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    cache_key TEXT UNIQUE NOT NULL,
                    document_id TEXT NOT NULL,
                    chunk_index INTEGER NOT NULL,
                    sub_question TEXT NOT NULL,
                    relevant_sentences_json TEXT NOT NULL,
                    html_content TEXT NOT NULL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
                """
            )
            # NOTE(review): the UNIQUE constraint on cache_key already gives
            # SQLite an implicit index, so this one looks redundant. It is
            # kept so the schema this method creates stays unchanged.
            conn.execute(
                """
                CREATE INDEX IF NOT EXISTS idx_highlights_cache_key
                ON chunk_highlights(cache_key)
                """
            )

    def get_highlight(self, cache_key: str) -> str | None:
        """Return the cached HTML stored under *cache_key*.

        Args:
            cache_key: Key the entry was stored under.

        Returns:
            The stored ``html_content`` string, or ``None`` when no row has
            that key.
        """
        with closing(_connect(self._db_path)) as conn:
            row = conn.execute(
                "SELECT html_content FROM chunk_highlights WHERE cache_key = ?",
                (cache_key,),
            ).fetchone()
        return None if row is None else row["html_content"]

    def set_highlight(
        self,
        cache_key: str,
        document_id: str,
        chunk_index: int,
        sub_question: str,
        relevant_sentences_json: str,
        html_content: str,
    ) -> None:
        """Store highlighted HTML under *cache_key*, replacing any existing entry.

        Args:
            cache_key: Unique key for the entry.
            document_id: Stored in the ``document_id`` column.
            chunk_index: Stored in the ``chunk_index`` column.
            sub_question: Stored in the ``sub_question`` column.
            relevant_sentences_json: Stored as-is in the
                ``relevant_sentences_json`` column.
            html_content: HTML later returned by ``get_highlight``.
        """
        # INSERT OR REPLACE removes a row with the same cache_key before
        # inserting the new one.
        with closing(_connect(self._db_path)) as conn, conn:
            conn.execute(
                """INSERT OR REPLACE INTO chunk_highlights
                   (cache_key, document_id, chunk_index, sub_question,
                    relevant_sentences_json, html_content)
                   VALUES (?, ?, ?, ?, ?, ?)""",
                (
                    cache_key,
                    document_id,
                    chunk_index,
                    sub_question,
                    relevant_sentences_json,
                    html_content,
                ),
            )

View File

@ -0,0 +1,222 @@
"""Tests for Phase 5.4 HighlightCache — SQLite cache for highlighted chunk HTML.
Covers:
- set_highlight stores HTML and relevant_sentences_json
- get_highlight retrieves cached HTML by cache key
- get_highlight returns None for missing cache key
- set_highlight overwrites existing entry (INSERT OR REPLACE)
- compute_cache_key produces same key for same inputs, different for different sub_questions
- compute_cache_key is deterministic (same inputs → same hash)
- get_highlight after set_highlight returns exact HTML content (no corruption)
- table creation is idempotent (init twice doesn't crash)
- multiple instances on same DB file share data correctly
Uses tmp_path for isolated test databases — no real filesystem pollution.
"""
import json
import pytest
from app.services.highlight_cache import HighlightCache, compute_cache_key
# ── compute_cache_key ──────────────────────────────────────────────────────
def test_compute_cache_key_same_inputs_same_hash():
    """Hashing the same (document, chunk, question) triple twice gives one key."""
    triple = ("doc1", 0, "What is the budget?")
    first = compute_cache_key(*triple)
    second = compute_cache_key(*triple)
    assert first == second
def test_compute_cache_key_different_sub_question_different_hash():
    """Changing only the sub-question changes the key."""
    budget_key, proposer_key = (
        compute_cache_key("doc1", 0, question)
        for question in ("What is the budget?", "Who proposed it?")
    )
    assert budget_key != proposer_key
def test_compute_cache_key_different_document_id_different_hash():
    """Changing only the document id changes the key."""
    question = "What is the budget?"
    distinct_keys = {
        compute_cache_key(document_id, 0, question)
        for document_id in ("doc1", "doc2")
    }
    # Two inputs collapse to one set element only if their keys are equal.
    assert len(distinct_keys) == 2
def test_compute_cache_key_different_chunk_index_different_hash():
    """Changing only the chunk index changes the key."""
    question = "What is the budget?"
    key_by_index = [
        compute_cache_key("doc1", index, question) for index in (0, 1)
    ]
    assert key_by_index[0] != key_by_index[1]
def test_compute_cache_key_is_64_char_hex():
    """The key is 64 characters drawn only from lowercase hex digits."""
    digest = compute_cache_key("doc1", 0, "What is the budget?")
    assert len(digest) == 64
    # Subset check: every character of the digest is a lowercase hex digit.
    assert set(digest) <= set("0123456789abcdef")
# ── HighlightCache basic CRUD ──────────────────────────────────────────────
def test_set_and_get_highlight(tmp_path):
    """A stored entry is readable via get_highlight and keeps its other columns.

    get_highlight only exposes ``html_content``, so the remaining columns
    (including ``relevant_sentences_json``) are checked by reading the
    ``chunk_highlights`` table directly.
    """
    # Local imports: only this raw-table inspection needs them.
    import sqlite3
    from contextlib import closing

    db_path = str(tmp_path / "highlights.db")
    cache = HighlightCache(db_path)
    sentences_json = json.dumps([0, 1, 2])
    cache.set_highlight(
        cache_key="abc123",
        document_id="doc1",
        chunk_index=0,
        sub_question="What is the budget?",
        relevant_sentences_json=sentences_json,
        html_content="<p>highlighted</p>",
    )

    assert cache.get_highlight("abc123") == "<p>highlighted</p>"

    with closing(sqlite3.connect(db_path)) as conn:
        stored = conn.execute(
            "SELECT document_id, chunk_index, sub_question, relevant_sentences_json"
            " FROM chunk_highlights WHERE cache_key = ?",
            ("abc123",),
        ).fetchone()
    assert stored == ("doc1", 0, "What is the budget?", sentences_json)
def test_get_highlight_missing_returns_none(tmp_path):
    """Looking up a key that was never stored yields None."""
    empty_cache = HighlightCache(str(tmp_path / "highlights.db"))
    assert empty_cache.get_highlight("nonexistent") is None
def test_set_highlight_overwrites_existing(tmp_path):
    """Writing the same cache_key twice replaces the row instead of adding one.

    Besides the HTML returned by get_highlight, the table is read directly to
    confirm exactly one row remains for the key and that its
    ``relevant_sentences_json`` is the second value written.
    """
    # Local imports: only this raw-table inspection needs them.
    import sqlite3
    from contextlib import closing

    db_path = str(tmp_path / "highlights.db")
    cache = HighlightCache(db_path)
    shared_fields = {
        "cache_key": "abc123",
        "document_id": "doc1",
        "chunk_index": 0,
        "sub_question": "What is the budget?",
    }
    cache.set_highlight(
        **shared_fields,
        relevant_sentences_json=json.dumps([0, 1]),
        html_content="<p>first</p>",
    )
    replacement_json = json.dumps([2, 3])
    cache.set_highlight(
        **shared_fields,
        relevant_sentences_json=replacement_json,
        html_content="<p>second</p>",
    )

    assert cache.get_highlight("abc123") == "<p>second</p>"

    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute(
            "SELECT relevant_sentences_json, html_content"
            " FROM chunk_highlights WHERE cache_key = ?",
            ("abc123",),
        ).fetchall()
    assert rows == [(replacement_json, "<p>second</p>")]
def test_get_highlight_returns_exact_html_no_corruption(tmp_path):
    """Multi-line HTML containing quotes and newlines round-trips unchanged."""
    store = HighlightCache(str(tmp_path / "highlights.db"))
    expected_html = (
        '<div class="chunk">\n'
        ' <p>Line one</p>\n'
        ' <mark>Line two</mark>\n'
        ' <p>Line three</p>\n'
        '</div>'
    )
    entry = {
        "cache_key": "key1",
        "document_id": "doc1",
        "chunk_index": 0,
        "sub_question": "Q?",
        "relevant_sentences_json": json.dumps([1]),
        "html_content": expected_html,
    }
    store.set_highlight(**entry)
    assert store.get_highlight(entry["cache_key"]) == expected_html
# ── Table init idempotency ─────────────────────────────────────────────────
def test_init_table_is_idempotent(tmp_path):
    """Constructing a second cache on a populated DB neither fails nor loses data."""
    db_file = str(tmp_path / "highlights.db")
    original = HighlightCache(db_file)
    original.set_highlight(
        cache_key="k1",
        document_id="d1",
        chunk_index=0,
        sub_question="Q1",
        relevant_sentences_json="[]",
        html_content="<p>hi</p>",
    )
    # Table creation runs again here, against a file that already has the table.
    reopened = HighlightCache(db_file)
    assert reopened.get_highlight("k1") == "<p>hi</p>"
# ── Multiple instances share data ──────────────────────────────────────────
def test_multiple_instances_share_same_db(tmp_path):
    """An entry written through one instance is visible through another on the same file."""
    db_file = str(tmp_path / "highlights.db")
    writer, reader = HighlightCache(db_file), HighlightCache(db_file)
    writer.set_highlight(
        cache_key="shared",
        document_id="doc1",
        chunk_index=0,
        sub_question="Q?",
        relevant_sentences_json="[]",
        html_content="<p>shared</p>",
    )
    assert reader.get_highlight("shared") == "<p>shared</p>"
# ── compute_cache_key integration with cache ───────────────────────────────
def test_compute_cache_key_and_round_trip(tmp_path):
    """A key from compute_cache_key works end to end for store and lookup."""
    cache = HighlightCache(str(tmp_path / "highlights.db"))
    document_id, chunk_index, sub_question = (
        "doc_42",
        7,
        "What was the total spending?",
    )
    computed_key = compute_cache_key(document_id, chunk_index, sub_question)
    cache.set_highlight(
        cache_key=computed_key,
        document_id=document_id,
        chunk_index=chunk_index,
        sub_question=sub_question,
        relevant_sentences_json=json.dumps([3, 4]),
        html_content="<mark>total</mark>",
    )
    assert cache.get_highlight(computed_key) == "<mark>total</mark>"
def test_different_sub_questions_produce_different_cache_entries(tmp_path):
    """Two sub-questions on the same chunk are cached as independent entries."""
    cache = HighlightCache(str(tmp_path / "highlights.db"))
    # (sub_question, relevant_sentences_json, html_content) per entry.
    entries = [
        ("What is the budget?", "[0]", "<p>budget</p>"),
        ("Who proposed it?", "[1]", "<p>proposer</p>"),
    ]
    keys = [compute_cache_key("doc1", 0, question) for question, _, _ in entries]

    for key, (question, sentences_json, html) in zip(keys, entries):
        cache.set_highlight(
            cache_key=key,
            document_id="doc1",
            chunk_index=0,
            sub_question=question,
            relevant_sentences_json=sentences_json,
            html_content=html,
        )

    # Both entries are written before either is read back, so a clobbered
    # first entry would be caught.
    for key, (_, _, html) in zip(keys, entries):
        assert cache.get_highlight(key) == html