feat: add SQLite highlight cache service (Phase 5.4.3)
- highlight_cache.py: HighlightCache class with get/set_highlight and compute_cache_key (sha256 hash of document_id|chunk_index|sub_question) - INSERT OR REPLACE semantics, idempotent table creation - 13 tests covering round-trip, overwrite, missing keys, determinism
This commit is contained in:
parent
b11d31e2d1
commit
bdbc8ea1a0
|
|
@ -0,0 +1,94 @@
|
|||
"""Highlight result cache.
|
||||
|
||||
Stores pre-computed highlighted chunk HTML pages in SQLite for instant retrieval.
|
||||
Uses sync sqlite3 — all operations are instant local reads/writes.
|
||||
Each method opens its own connection.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _connect(db_path: str) -> sqlite3.Connection:
    """Open a new SQLite connection to *db_path*.

    The connection's ``row_factory`` is set to ``sqlite3.Row`` so that result
    columns can be read by name (e.g. ``row["html_content"]``). The caller
    owns the returned connection.
    """
    connection = sqlite3.connect(db_path)
    connection.row_factory = sqlite3.Row
    return connection
|
||||
|
||||
|
||||
def compute_cache_key(document_id: str, chunk_index: int, sub_question: str) -> str:
    """Return the deterministic cache key for one highlight request.

    The three values are joined with ``|`` in the order document id, chunk
    index, sub-question; the result is UTF-8 encoded and hashed with SHA-256.
    The key is the hex digest of that hash.

    Note: the fields are joined without escaping, so a ``|`` inside
    ``document_id`` or ``sub_question`` can make two different triples
    serialise to the same string and therefore to the same key.
    """
    payload = f"{document_id}|{chunk_index}|{sub_question}"
    hasher = hashlib.sha256()
    hasher.update(payload.encode("utf-8"))
    return hasher.hexdigest()
|
||||
|
||||
|
||||
class HighlightCache:
    """SQLite-backed cache of highlighted chunk HTML, addressed by cache key.

    Each operation opens its own connection via ``_connect`` and closes it
    before returning. A ``sqlite3.Connection`` used as a context manager only
    commits or rolls back the transaction; it does not close the connection,
    so every method closes it explicitly in a ``finally`` clause.
    """

    def __init__(self, db_path: str) -> None:
        """Store *db_path* and make sure the ``chunk_highlights`` table exists."""
        self._db_path = db_path
        self._init_table()

    def _init_table(self) -> None:
        """Create the table and its index if they do not exist (idempotent)."""
        conn = _connect(self._db_path)
        try:
            # ``with conn`` commits on success and rolls back on error.
            with conn:
                conn.execute(
                    """
                    CREATE TABLE IF NOT EXISTS chunk_highlights (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        cache_key TEXT UNIQUE NOT NULL,
                        document_id TEXT NOT NULL,
                        chunk_index INTEGER NOT NULL,
                        sub_question TEXT NOT NULL,
                        relevant_sentences_json TEXT NOT NULL,
                        html_content TEXT NOT NULL,
                        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                    )
                    """
                )
                # The UNIQUE constraint on cache_key already gives SQLite an
                # index for lookups; this named index is retained as-is so the
                # schema stays unchanged.
                conn.execute(
                    """
                    CREATE INDEX IF NOT EXISTS idx_highlights_cache_key
                    ON chunk_highlights(cache_key)
                    """
                )
        finally:
            conn.close()

    def get_highlight(self, cache_key: str) -> str | None:
        """Return the cached HTML stored under *cache_key*, or ``None`` if absent."""
        conn = _connect(self._db_path)
        try:
            row = conn.execute(
                "SELECT html_content FROM chunk_highlights WHERE cache_key = ?",
                (cache_key,),
            ).fetchone()
            if row is None:
                return None
            return row["html_content"]
        finally:
            conn.close()

    def set_highlight(
        self,
        cache_key: str,
        document_id: str,
        chunk_index: int,
        sub_question: str,
        relevant_sentences_json: str,
        html_content: str,
    ) -> None:
        """Store highlighted HTML under *cache_key*, overwriting any existing entry.

        Uses ``INSERT OR REPLACE``: when a row with the same ``cache_key``
        already exists, it is replaced by the new values.
        """
        conn = _connect(self._db_path)
        try:
            # Commit on success, roll back if the insert raises.
            with conn:
                conn.execute(
                    """INSERT OR REPLACE INTO chunk_highlights
                    (cache_key, document_id, chunk_index, sub_question,
                    relevant_sentences_json, html_content)
                    VALUES (?, ?, ?, ?, ?, ?)""",
                    (
                        cache_key,
                        document_id,
                        chunk_index,
                        sub_question,
                        relevant_sentences_json,
                        html_content,
                    ),
                )
        finally:
            conn.close()
|
||||
|
|
@ -0,0 +1,222 @@
|
|||
"""Tests for Phase 5.4 HighlightCache — SQLite cache for highlighted chunk HTML.
|
||||
|
||||
Covers:
|
||||
- set_highlight stores HTML and relevant_sentences_json
|
||||
- get_highlight retrieves cached HTML by cache key
|
||||
- get_highlight returns None for missing cache key
|
||||
- set_highlight overwrites existing entry (INSERT OR REPLACE)
|
||||
- compute_cache_key produces same key for same inputs, different for different sub_questions
|
||||
- compute_cache_key is deterministic (same inputs → same hash)
|
||||
- get_highlight after set_highlight returns exact HTML content (no corruption)
|
||||
- table creation is idempotent (init twice doesn't crash)
|
||||
- multiple instances on same DB file share data correctly
|
||||
|
||||
Uses tmp_path for isolated test databases — no real filesystem pollution.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from app.services.highlight_cache import HighlightCache, compute_cache_key
|
||||
|
||||
|
||||
# ── compute_cache_key ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_compute_cache_key_same_inputs_same_hash():
    """Hashing the same triple twice yields the same key."""
    triple = ("doc1", 0, "What is the budget?")
    first = compute_cache_key(*triple)
    second = compute_cache_key(*triple)
    assert first == second
|
||||
|
||||
|
||||
def test_compute_cache_key_different_sub_question_different_hash():
    """Changing only the sub-question changes the key."""
    budget_key = compute_cache_key("doc1", 0, "What is the budget?")
    proposer_key = compute_cache_key("doc1", 0, "Who proposed it?")
    assert budget_key != proposer_key
|
||||
|
||||
|
||||
def test_compute_cache_key_different_document_id_different_hash():
    """Changing only the document id changes the key."""
    question = "What is the budget?"
    doc1_key = compute_cache_key("doc1", 0, question)
    doc2_key = compute_cache_key("doc2", 0, question)
    assert doc1_key != doc2_key
|
||||
|
||||
|
||||
def test_compute_cache_key_different_chunk_index_different_hash():
    """Changing only the chunk index changes the key."""
    question = "What is the budget?"
    chunk0_key = compute_cache_key("doc1", 0, question)
    chunk1_key = compute_cache_key("doc1", 1, question)
    assert chunk0_key != chunk1_key
|
||||
|
||||
|
||||
def test_compute_cache_key_is_64_char_hex():
    """The key is 64 characters long and uses only lowercase hex digits."""
    digest = compute_cache_key("doc1", 0, "What is the budget?")
    assert len(digest) == 64
    assert set(digest) <= set("0123456789abcdef")
|
||||
|
||||
|
||||
# ── HighlightCache basic CRUD ──────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_set_and_get_highlight(tmp_path):
    """A stored entry can be read back by its cache key."""
    store = HighlightCache(str(tmp_path / "highlights.db"))
    entry = dict(
        cache_key="abc123",
        document_id="doc1",
        chunk_index=0,
        sub_question="What is the budget?",
        relevant_sentences_json=json.dumps([0, 1, 2]),
        html_content="<p>highlighted</p>",
    )

    store.set_highlight(**entry)

    assert store.get_highlight("abc123") == "<p>highlighted</p>"
|
||||
|
||||
|
||||
def test_get_highlight_missing_returns_none(tmp_path):
    """Looking up a key that was never stored returns None."""
    store = HighlightCache(str(tmp_path / "highlights.db"))

    assert store.get_highlight("nonexistent") is None
|
||||
|
||||
|
||||
def test_set_highlight_overwrites_existing(tmp_path):
    """Writing twice under one cache key keeps only the second HTML."""
    store = HighlightCache(str(tmp_path / "highlights.db"))
    versions = (
        ([0, 1], "<p>first</p>"),
        ([2, 3], "<p>second</p>"),
    )

    # Both writes target the same key; only the payload differs.
    for sentence_ids, html in versions:
        store.set_highlight(
            cache_key="abc123",
            document_id="doc1",
            chunk_index=0,
            sub_question="What is the budget?",
            relevant_sentences_json=json.dumps(sentence_ids),
            html_content=html,
        )

    assert store.get_highlight("abc123") == "<p>second</p>"
|
||||
|
||||
|
||||
def test_get_highlight_returns_exact_html_no_corruption(tmp_path):
    """Multi-line HTML with quotes and markup round-trips unchanged."""
    store = HighlightCache(str(tmp_path / "highlights.db"))
    html_lines = [
        '<div class="chunk">',
        ' <p>Line one</p>',
        ' <mark>Line two</mark>',
        ' <p>Line three</p>',
        '</div>',
    ]
    expected_html = "\n".join(html_lines)

    store.set_highlight(
        cache_key="key1",
        document_id="doc1",
        chunk_index=0,
        sub_question="Q?",
        relevant_sentences_json=json.dumps([1]),
        html_content=expected_html,
    )

    assert store.get_highlight("key1") == expected_html
|
||||
|
||||
|
||||
# ── Table init idempotency ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_init_table_is_idempotent(tmp_path):
    """Constructing a second cache on an existing DB neither fails nor loses data."""
    database = str(tmp_path / "highlights.db")
    HighlightCache(database).set_highlight(
        cache_key="k1",
        document_id="d1",
        chunk_index=0,
        sub_question="Q1",
        relevant_sentences_json="[]",
        html_content="<p>hi</p>",
    )

    # Re-running table creation on the same file must be harmless.
    reopened = HighlightCache(database)

    assert reopened.get_highlight("k1") == "<p>hi</p>"
|
||||
|
||||
|
||||
# ── Multiple instances share data ──────────────────────────────────────────
|
||||
|
||||
|
||||
def test_multiple_instances_share_same_db(tmp_path):
    """An entry written through one instance is visible through another."""
    database = str(tmp_path / "highlights.db")
    writer, reader = HighlightCache(database), HighlightCache(database)

    writer.set_highlight(
        cache_key="shared",
        document_id="doc1",
        chunk_index=0,
        sub_question="Q?",
        relevant_sentences_json="[]",
        html_content="<p>shared</p>",
    )

    assert reader.get_highlight("shared") == "<p>shared</p>"
|
||||
|
||||
|
||||
# ── compute_cache_key integration with cache ───────────────────────────────
|
||||
|
||||
|
||||
def test_compute_cache_key_and_round_trip(tmp_path):
    """A key from compute_cache_key works for both storing and retrieving."""
    store = HighlightCache(str(tmp_path / "highlights.db"))
    document_id, chunk_index = "doc_42", 7
    sub_question = "What was the total spending?"
    computed_key = compute_cache_key(document_id, chunk_index, sub_question)

    store.set_highlight(
        cache_key=computed_key,
        document_id=document_id,
        chunk_index=chunk_index,
        sub_question=sub_question,
        relevant_sentences_json=json.dumps([3, 4]),
        html_content="<mark>total</mark>",
    )

    assert store.get_highlight(computed_key) == "<mark>total</mark>"
|
||||
|
||||
|
||||
def test_different_sub_questions_produce_different_cache_entries(tmp_path):
    """Two sub-questions on the same chunk are cached as separate entries."""
    store = HighlightCache(str(tmp_path / "highlights.db"))
    cases = (
        ("What is the budget?", "[0]", "<p>budget</p>"),
        ("Who proposed it?", "[1]", "<p>proposer</p>"),
    )

    # Store every entry first, then verify, so a later write cannot hide an
    # overwrite of an earlier one.
    stored = []
    for question, sentences_json, html in cases:
        key = compute_cache_key("doc1", 0, question)
        store.set_highlight(
            cache_key=key,
            document_id="doc1",
            chunk_index=0,
            sub_question=question,
            relevant_sentences_json=sentences_json,
            html_content=html,
        )
        stored.append((key, html))

    for key, html in stored:
        assert store.get_highlight(key) == html
|
||||
Loading…
Reference in New Issue