From bdbc8ea1a017a402f0059657cf3c801f4fd87fee Mon Sep 17 00:00:00 2001
From: Woody
Date: Wed, 29 Apr 2026 09:26:20 +0800
Subject: [PATCH] feat: add SQLite highlight cache service (Phase 5.4.3)

- highlight_cache.py: HighlightCache class with get/set_highlight and
  compute_cache_key (sha256 hash of document_id|chunk_index|sub_question)
- INSERT OR REPLACE semantics, idempotent table creation
- 13 tests covering round-trip, overwrite, missing keys, determinism
---
 backend/app/services/highlight_cache.py       | 125 ++++++++++
 .../app/test/test_phase5_highlight_cache.py   | 222 ++++++++++++++++++
 2 files changed, 347 insertions(+)
 create mode 100644 backend/app/services/highlight_cache.py
 create mode 100644 backend/app/test/test_phase5_highlight_cache.py

diff --git a/backend/app/services/highlight_cache.py b/backend/app/services/highlight_cache.py
new file mode 100644
index 0000000..554b444
--- /dev/null
+++ b/backend/app/services/highlight_cache.py
@@ -0,0 +1,125 @@
+"""Highlight result cache.
+
+Stores pre-computed highlighted chunk HTML pages in SQLite for instant retrieval.
+Uses sync sqlite3 — all operations are instant local reads/writes.
+Each method opens its own connection and closes it before returning.
+"""
+
+import contextlib
+import hashlib
+import json
+import logging
+import sqlite3
+
+logger = logging.getLogger(__name__)
+
+
+def _connect(db_path: str) -> sqlite3.Connection:
+    """Open a new SQLite connection whose rows can be indexed by column name.
+
+    The caller owns the returned connection and is responsible for closing it.
+    """
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    return conn
+
+
+def compute_cache_key(document_id: str, chunk_index: int, sub_question: str) -> str:
+    """Return the deterministic cache key for one (document, chunk, question) triple.
+
+    The key is the SHA-256 hex digest of ``document_id|chunk_index|sub_question``
+    encoded as UTF-8. The fields are joined without escaping, so inputs that
+    themselves contain ``|`` can produce the same key as a different triple.
+    """
+    raw = f"{document_id}|{chunk_index}|{sub_question}"
+    return hashlib.sha256(raw.encode("utf-8")).hexdigest()
+
+
+class HighlightCache:
+    """SQLite-backed cache of highlighted chunk HTML, looked up by cache key."""
+
+    def __init__(self, db_path: str) -> None:
+        """Remember the database path and make sure the cache table exists.
+
+        Args:
+            db_path: Path of the SQLite database file, passed to ``sqlite3.connect``.
+        """
+        self._db_path = db_path
+        self._init_table()
+
+    def _init_table(self) -> None:
+        """Create the ``chunk_highlights`` table if it does not exist (idempotent)."""
+        # A sqlite3 connection used directly as a context manager only commits or
+        # rolls back; closing() is what actually releases the database handle.
+        with contextlib.closing(_connect(self._db_path)) as conn:
+            # The UNIQUE constraint on cache_key already makes SQLite maintain an
+            # index on that column, so no separate CREATE INDEX is needed.
+            conn.execute(
+                """
+                CREATE TABLE IF NOT EXISTS chunk_highlights (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    cache_key TEXT UNIQUE NOT NULL,
+                    document_id TEXT NOT NULL,
+                    chunk_index INTEGER NOT NULL,
+                    sub_question TEXT NOT NULL,
+                    relevant_sentences_json TEXT NOT NULL,
+                    html_content TEXT NOT NULL,
+                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                )
+                """
+            )
+            conn.commit()
+
+    def get_highlight(self, cache_key: str) -> str | None:
+        """Look up the cached HTML for a cache key.
+
+        Args:
+            cache_key: Key the entry was stored under, e.g. from ``compute_cache_key``.
+
+        Returns:
+            The stored ``html_content`` string, or ``None`` if no row has that key.
+        """
+        with contextlib.closing(_connect(self._db_path)) as conn:
+            row = conn.execute(
+                "SELECT html_content FROM chunk_highlights WHERE cache_key = ?",
+                (cache_key,),
+            ).fetchone()
+        return None if row is None else row["html_content"]
+
+    def set_highlight(
+        self,
+        cache_key: str,
+        document_id: str,
+        chunk_index: int,
+        sub_question: str,
+        relevant_sentences_json: str,
+        html_content: str,
+    ) -> None:
+        """Store highlighted HTML in the cache, replacing any entry with the same key.
+
+        Args:
+            cache_key: Unique key for the entry, e.g. from ``compute_cache_key``.
+            document_id: Stored with the entry in the ``document_id`` column.
+            chunk_index: Stored with the entry in the ``chunk_index`` column.
+            sub_question: Stored with the entry in the ``sub_question`` column.
+            relevant_sentences_json: Stored verbatim as text; not parsed here.
+            html_content: HTML returned later by ``get_highlight``.
+        """
+        with contextlib.closing(_connect(self._db_path)) as conn:
+            # INSERT OR REPLACE deletes the row that conflicts on cache_key and inserts
+            # a fresh one, so the replacement gets a new id and created_at.
+            conn.execute(
+                """INSERT OR REPLACE INTO chunk_highlights
+                   (cache_key, document_id, chunk_index, sub_question,
+                    relevant_sentences_json, html_content)
+                   VALUES (?, ?, ?, ?, ?, ?)""",
+                (
+                    cache_key,
+                    document_id,
+                    chunk_index,
+                    sub_question,
+                    relevant_sentences_json,
+                    html_content,
+                ),
+            )
+            conn.commit()
diff --git a/backend/app/test/test_phase5_highlight_cache.py b/backend/app/test/test_phase5_highlight_cache.py
new file mode 100644
index 0000000..90531f0
--- /dev/null
+++ b/backend/app/test/test_phase5_highlight_cache.py
@@ -0,0 +1,222 @@
+"""Tests for Phase 5.4 HighlightCache — SQLite cache for highlighted chunk HTML.
+
+Covers:
+- set_highlight stores HTML and relevant_sentences_json
+- get_highlight retrieves cached HTML by cache key
+- get_highlight returns None for missing cache key
+- set_highlight overwrites existing entry (INSERT OR REPLACE)
+- compute_cache_key produces same key for same inputs, different for different sub_questions
+- compute_cache_key is deterministic (same inputs → same hash)
+- get_highlight after set_highlight returns exact HTML content (no corruption)
+- table creation is idempotent (init twice doesn't crash)
+- multiple instances on same DB file share data correctly
+
+Uses tmp_path for isolated test databases — no real filesystem pollution.
+""" + +import json + +import pytest + +from app.services.highlight_cache import HighlightCache, compute_cache_key + + +# ── compute_cache_key ────────────────────────────────────────────────────── + + +def test_compute_cache_key_same_inputs_same_hash(): + key1 = compute_cache_key("doc1", 0, "What is the budget?") + key2 = compute_cache_key("doc1", 0, "What is the budget?") + assert key1 == key2 + + +def test_compute_cache_key_different_sub_question_different_hash(): + key1 = compute_cache_key("doc1", 0, "What is the budget?") + key2 = compute_cache_key("doc1", 0, "Who proposed it?") + assert key1 != key2 + + +def test_compute_cache_key_different_document_id_different_hash(): + key1 = compute_cache_key("doc1", 0, "What is the budget?") + key2 = compute_cache_key("doc2", 0, "What is the budget?") + assert key1 != key2 + + +def test_compute_cache_key_different_chunk_index_different_hash(): + key1 = compute_cache_key("doc1", 0, "What is the budget?") + key2 = compute_cache_key("doc1", 1, "What is the budget?") + assert key1 != key2 + + +def test_compute_cache_key_is_64_char_hex(): + key = compute_cache_key("doc1", 0, "What is the budget?") + assert len(key) == 64 + assert all(c in "0123456789abcdef" for c in key) + + +# ── HighlightCache basic CRUD ────────────────────────────────────────────── + + +def test_set_and_get_highlight(tmp_path): + db_path = str(tmp_path / "highlights.db") + cache = HighlightCache(db_path) + + cache.set_highlight( + cache_key="abc123", + document_id="doc1", + chunk_index=0, + sub_question="What is the budget?", + relevant_sentences_json=json.dumps([0, 1, 2]), + html_content="

highlighted

", + ) + + result = cache.get_highlight("abc123") + assert result == "

highlighted

" + + +def test_get_highlight_missing_returns_none(tmp_path): + db_path = str(tmp_path / "highlights.db") + cache = HighlightCache(db_path) + + result = cache.get_highlight("nonexistent") + assert result is None + + +def test_set_highlight_overwrites_existing(tmp_path): + db_path = str(tmp_path / "highlights.db") + cache = HighlightCache(db_path) + + cache.set_highlight( + cache_key="abc123", + document_id="doc1", + chunk_index=0, + sub_question="What is the budget?", + relevant_sentences_json=json.dumps([0, 1]), + html_content="

first

", + ) + + cache.set_highlight( + cache_key="abc123", + document_id="doc1", + chunk_index=0, + sub_question="What is the budget?", + relevant_sentences_json=json.dumps([2, 3]), + html_content="

second

", + ) + + result = cache.get_highlight("abc123") + assert result == "

second

" + + +def test_get_highlight_returns_exact_html_no_corruption(tmp_path): + db_path = str(tmp_path / "highlights.db") + cache = HighlightCache(db_path) + + html = ( + '
\n' + '

Line one

\n' + ' Line two\n' + '

Line three

\n' + '
' + ) + + cache.set_highlight( + cache_key="key1", + document_id="doc1", + chunk_index=0, + sub_question="Q?", + relevant_sentences_json=json.dumps([1]), + html_content=html, + ) + + result = cache.get_highlight("key1") + assert result == html + + +# ── Table init idempotency ───────────────────────────────────────────────── + + +def test_init_table_is_idempotent(tmp_path): + db_path = str(tmp_path / "highlights.db") + cache1 = HighlightCache(db_path) + cache1.set_highlight( + cache_key="k1", + document_id="d1", + chunk_index=0, + sub_question="Q1", + relevant_sentences_json="[]", + html_content="

hi

", + ) + + # Second init on same DB should not crash + cache2 = HighlightCache(db_path) + assert cache2.get_highlight("k1") == "

hi

" + + +# ── Multiple instances share data ────────────────────────────────────────── + + +def test_multiple_instances_share_same_db(tmp_path): + db_path = str(tmp_path / "highlights.db") + cache1 = HighlightCache(db_path) + cache2 = HighlightCache(db_path) + + cache1.set_highlight( + cache_key="shared", + document_id="doc1", + chunk_index=0, + sub_question="Q?", + relevant_sentences_json="[]", + html_content="

shared

", + ) + + assert cache2.get_highlight("shared") == "

shared

" + + +# ── compute_cache_key integration with cache ─────────────────────────────── + + +def test_compute_cache_key_and_round_trip(tmp_path): + db_path = str(tmp_path / "highlights.db") + cache = HighlightCache(db_path) + + key = compute_cache_key("doc_42", 7, "What was the total spending?") + cache.set_highlight( + cache_key=key, + document_id="doc_42", + chunk_index=7, + sub_question="What was the total spending?", + relevant_sentences_json=json.dumps([3, 4]), + html_content="total", + ) + + assert cache.get_highlight(key) == "total" + + +def test_different_sub_questions_produce_different_cache_entries(tmp_path): + db_path = str(tmp_path / "highlights.db") + cache = HighlightCache(db_path) + + key1 = compute_cache_key("doc1", 0, "What is the budget?") + key2 = compute_cache_key("doc1", 0, "Who proposed it?") + + cache.set_highlight( + cache_key=key1, + document_id="doc1", + chunk_index=0, + sub_question="What is the budget?", + relevant_sentences_json="[0]", + html_content="

budget

", + ) + + cache.set_highlight( + cache_key=key2, + document_id="doc1", + chunk_index=0, + sub_question="Who proposed it?", + relevant_sentences_json="[1]", + html_content="

proposer

", + ) + + assert cache.get_highlight(key1) == "

budget

" + assert cache.get_highlight(key2) == "

proposer

"