# Source: legco_ai_assistant/backend/app/test/test_phase5_highlight_cache.py
# (file-viewer metadata: 223 lines, 6.9 KiB, Python)

"""Tests for Phase 5.4 HighlightCache — SQLite cache for highlighted chunk HTML.
Covers:
- set_highlight stores HTML (relevant_sentences_json is supplied but not read back by these tests)
- get_highlight retrieves cached HTML by cache key
- get_highlight returns None for missing cache key
- set_highlight overwrites existing entry (INSERT OR REPLACE)
- compute_cache_key is deterministic (same inputs → same hash)
- compute_cache_key produces a different key when document_id, chunk_index, or sub_question differs
- compute_cache_key returns a 64-character lowercase hex string
- get_highlight after set_highlight returns exact HTML content (no corruption)
- table creation is idempotent (init twice doesn't crash)
- multiple instances on same DB file share data correctly
Uses tmp_path for isolated test databases — no real filesystem pollution.
"""
import json
import pytest
from app.services.highlight_cache import HighlightCache, compute_cache_key
# ── compute_cache_key ──────────────────────────────────────────────────────
def test_compute_cache_key_same_inputs_same_hash():
    """Hashing the same (document, chunk, question) triple twice yields equal keys."""
    triple = ("doc1", 0, "What is the budget?")
    first = compute_cache_key(*triple)
    second = compute_cache_key(*triple)
    assert first == second
def test_compute_cache_key_different_sub_question_different_hash():
    """Changing only the sub-question must change the resulting key."""
    budget_key, proposer_key = (
        compute_cache_key("doc1", 0, question)
        for question in ("What is the budget?", "Who proposed it?")
    )
    assert budget_key != proposer_key
def test_compute_cache_key_different_document_id_different_hash():
    """Changing only the document id must change the resulting key."""
    question = "What is the budget?"
    keys = [compute_cache_key(doc_id, 0, question) for doc_id in ("doc1", "doc2")]
    assert keys[0] != keys[1]
def test_compute_cache_key_different_chunk_index_different_hash():
    """Changing only the chunk index must change the resulting key."""
    key_by_index = {
        index: compute_cache_key("doc1", index, "What is the budget?")
        for index in (0, 1)
    }
    assert key_by_index[0] != key_by_index[1]
def test_compute_cache_key_is_64_char_hex():
    """The key is exactly 64 characters, each a lowercase hexadecimal digit."""
    hex_alphabet = "0123456789abcdef"
    digest = compute_cache_key("doc1", 0, "What is the budget?")
    assert len(digest) == 64
    assert all(symbol in hex_alphabet for symbol in digest)
# ── HighlightCache basic CRUD ──────────────────────────────────────────────
def test_set_and_get_highlight(tmp_path):
    """A highlight written with set_highlight is returned by get_highlight for its key."""
    store = HighlightCache(str(tmp_path / "highlights.db"))
    entry = {
        "cache_key": "abc123",
        "document_id": "doc1",
        "chunk_index": 0,
        "sub_question": "What is the budget?",
        "relevant_sentences_json": json.dumps([0, 1, 2]),
        "html_content": "<p>highlighted</p>",
    }
    store.set_highlight(**entry)
    assert store.get_highlight("abc123") == "<p>highlighted</p>"
def test_get_highlight_missing_returns_none(tmp_path):
    """Looking up a key that was never stored yields None."""
    database_file = tmp_path / "highlights.db"
    empty_cache = HighlightCache(str(database_file))
    assert empty_cache.get_highlight("nonexistent") is None
def test_set_highlight_overwrites_existing(tmp_path):
    """A second set_highlight under the same cache key replaces the earlier HTML."""
    store = HighlightCache(str(tmp_path / "highlights.db"))
    # Written in order; only the last version should survive under "abc123".
    versions = (
        ([0, 1], "<p>first</p>"),
        ([2, 3], "<p>second</p>"),
    )
    for sentence_ids, markup in versions:
        store.set_highlight(
            cache_key="abc123",
            document_id="doc1",
            chunk_index=0,
            sub_question="What is the budget?",
            relevant_sentences_json=json.dumps(sentence_ids),
            html_content=markup,
        )
    assert store.get_highlight("abc123") == "<p>second</p>"
def test_get_highlight_returns_exact_html_no_corruption(tmp_path):
    """Multi-line HTML with quotes and newlines round-trips through the cache unchanged."""
    store = HighlightCache(str(tmp_path / "highlights.db"))
    fragments = [
        '<div class="chunk">\n',
        ' <p>Line one</p>\n',
        ' <mark>Line two</mark>\n',
        ' <p>Line three</p>\n',
        '</div>',
    ]
    expected_html = "".join(fragments)
    store.set_highlight(
        cache_key="key1",
        document_id="doc1",
        chunk_index=0,
        sub_question="Q?",
        relevant_sentences_json=json.dumps([1]),
        html_content=expected_html,
    )
    fetched = store.get_highlight("key1")
    assert fetched == expected_html
# ── Table init idempotency ─────────────────────────────────────────────────
def test_init_table_is_idempotent(tmp_path):
    """Constructing HighlightCache twice on one DB file neither raises nor loses data."""
    location = str(tmp_path / "highlights.db")
    first_instance = HighlightCache(location)
    seed_row = dict(
        cache_key="k1",
        document_id="d1",
        chunk_index=0,
        sub_question="Q1",
        relevant_sentences_json="[]",
        html_content="<p>hi</p>",
    )
    first_instance.set_highlight(**seed_row)
    # Re-initialising against the already-populated file must succeed.
    second_instance = HighlightCache(location)
    assert second_instance.get_highlight("k1") == "<p>hi</p>"
# ── Multiple instances share data ──────────────────────────────────────────
def test_multiple_instances_share_same_db(tmp_path):
    """A row written through one instance is visible through another on the same file."""
    location = str(tmp_path / "highlights.db")
    # Both instances exist before the write happens.
    writer, reader = HighlightCache(location), HighlightCache(location)
    writer.set_highlight(
        cache_key="shared",
        document_id="doc1",
        chunk_index=0,
        sub_question="Q?",
        relevant_sentences_json="[]",
        html_content="<p>shared</p>",
    )
    seen_by_reader = reader.get_highlight("shared")
    assert seen_by_reader == "<p>shared</p>"
# ── compute_cache_key integration with cache ───────────────────────────────
def test_compute_cache_key_and_round_trip(tmp_path):
    """A key from compute_cache_key works as the cache key for set/get_highlight."""
    doc_id, chunk_no, question = "doc_42", 7, "What was the total spending?"
    store = HighlightCache(str(tmp_path / "highlights.db"))
    digest = compute_cache_key(doc_id, chunk_no, question)
    store.set_highlight(
        cache_key=digest,
        document_id=doc_id,
        chunk_index=chunk_no,
        sub_question=question,
        relevant_sentences_json=json.dumps([3, 4]),
        html_content="<mark>total</mark>",
    )
    assert store.get_highlight(digest) == "<mark>total</mark>"
def test_different_sub_questions_produce_different_cache_entries(tmp_path):
    """Two sub-questions on the same chunk are stored and retrieved independently."""
    store = HighlightCache(str(tmp_path / "highlights.db"))
    # (sub_question, relevant_sentences_json, html_content) per entry.
    cases = [
        ("What is the budget?", "[0]", "<p>budget</p>"),
        ("Who proposed it?", "[1]", "<p>proposer</p>"),
    ]
    # Both keys are derived before anything is written.
    keys = [compute_cache_key("doc1", 0, question) for question, _, _ in cases]
    for key, (question, sentences, markup) in zip(keys, cases):
        store.set_highlight(
            cache_key=key,
            document_id="doc1",
            chunk_index=0,
            sub_question=question,
            relevant_sentences_json=sentences,
            html_content=markup,
        )
    for key, (_, _, markup) in zip(keys, cases):
        assert store.get_highlight(key) == markup