"""Tests for Phase 5.4 HighlightCache — SQLite cache for highlighted chunk HTML. Covers: - set_highlight stores HTML and relevant_sentences_json - get_highlight retrieves cached HTML by cache key - get_highlight returns None for missing cache key - set_highlight overwrites existing entry (INSERT OR REPLACE) - compute_cache_key produces same key for same inputs, different for different sub_questions - compute_cache_key is deterministic (same inputs → same hash) - get_highlight after set_highlight returns exact HTML content (no corruption) - table creation is idempotent (init twice doesn't crash) - multiple instances on same DB file share data correctly Uses tmp_path for isolated test databases — no real filesystem pollution. """ import json import pytest from app.services.highlight_cache import HighlightCache, compute_cache_key # ── compute_cache_key ────────────────────────────────────────────────────── def test_compute_cache_key_same_inputs_same_hash(): key1 = compute_cache_key("doc1", 0, "What is the budget?") key2 = compute_cache_key("doc1", 0, "What is the budget?") assert key1 == key2 def test_compute_cache_key_different_sub_question_different_hash(): key1 = compute_cache_key("doc1", 0, "What is the budget?") key2 = compute_cache_key("doc1", 0, "Who proposed it?") assert key1 != key2 def test_compute_cache_key_different_document_id_different_hash(): key1 = compute_cache_key("doc1", 0, "What is the budget?") key2 = compute_cache_key("doc2", 0, "What is the budget?") assert key1 != key2 def test_compute_cache_key_different_chunk_index_different_hash(): key1 = compute_cache_key("doc1", 0, "What is the budget?") key2 = compute_cache_key("doc1", 1, "What is the budget?") assert key1 != key2 def test_compute_cache_key_is_64_char_hex(): key = compute_cache_key("doc1", 0, "What is the budget?") assert len(key) == 64 assert all(c in "0123456789abcdef" for c in key) # ── HighlightCache basic CRUD ────────────────────────────────────────────── def test_set_and_get_highlight(tmp_path): db_path = str(tmp_path / "highlights.db") cache = HighlightCache(db_path) cache.set_highlight( cache_key="abc123", document_id="doc1", chunk_index=0, sub_question="What is the budget?", relevant_sentences_json=json.dumps([0, 1, 2]), html_content="

highlighted

", ) result = cache.get_highlight("abc123") assert result == "

highlighted

" def test_get_highlight_missing_returns_none(tmp_path): db_path = str(tmp_path / "highlights.db") cache = HighlightCache(db_path) result = cache.get_highlight("nonexistent") assert result is None def test_set_highlight_overwrites_existing(tmp_path): db_path = str(tmp_path / "highlights.db") cache = HighlightCache(db_path) cache.set_highlight( cache_key="abc123", document_id="doc1", chunk_index=0, sub_question="What is the budget?", relevant_sentences_json=json.dumps([0, 1]), html_content="

first

", ) cache.set_highlight( cache_key="abc123", document_id="doc1", chunk_index=0, sub_question="What is the budget?", relevant_sentences_json=json.dumps([2, 3]), html_content="

second

", ) result = cache.get_highlight("abc123") assert result == "

second

" def test_get_highlight_returns_exact_html_no_corruption(tmp_path): db_path = str(tmp_path / "highlights.db") cache = HighlightCache(db_path) html = ( '
\n' '

Line one

\n' ' Line two\n' '

Line three

\n' '
' ) cache.set_highlight( cache_key="key1", document_id="doc1", chunk_index=0, sub_question="Q?", relevant_sentences_json=json.dumps([1]), html_content=html, ) result = cache.get_highlight("key1") assert result == html # ── Table init idempotency ───────────────────────────────────────────────── def test_init_table_is_idempotent(tmp_path): db_path = str(tmp_path / "highlights.db") cache1 = HighlightCache(db_path) cache1.set_highlight( cache_key="k1", document_id="d1", chunk_index=0, sub_question="Q1", relevant_sentences_json="[]", html_content="

hi

", ) # Second init on same DB should not crash cache2 = HighlightCache(db_path) assert cache2.get_highlight("k1") == "

hi

" # ── Multiple instances share data ────────────────────────────────────────── def test_multiple_instances_share_same_db(tmp_path): db_path = str(tmp_path / "highlights.db") cache1 = HighlightCache(db_path) cache2 = HighlightCache(db_path) cache1.set_highlight( cache_key="shared", document_id="doc1", chunk_index=0, sub_question="Q?", relevant_sentences_json="[]", html_content="

shared

", ) assert cache2.get_highlight("shared") == "

shared

" # ── compute_cache_key integration with cache ─────────────────────────────── def test_compute_cache_key_and_round_trip(tmp_path): db_path = str(tmp_path / "highlights.db") cache = HighlightCache(db_path) key = compute_cache_key("doc_42", 7, "What was the total spending?") cache.set_highlight( cache_key=key, document_id="doc_42", chunk_index=7, sub_question="What was the total spending?", relevant_sentences_json=json.dumps([3, 4]), html_content="total", ) assert cache.get_highlight(key) == "total" def test_different_sub_questions_produce_different_cache_entries(tmp_path): db_path = str(tmp_path / "highlights.db") cache = HighlightCache(db_path) key1 = compute_cache_key("doc1", 0, "What is the budget?") key2 = compute_cache_key("doc1", 0, "Who proposed it?") cache.set_highlight( cache_key=key1, document_id="doc1", chunk_index=0, sub_question="What is the budget?", relevant_sentences_json="[0]", html_content="

budget

", ) cache.set_highlight( cache_key=key2, document_id="doc1", chunk_index=0, sub_question="Who proposed it?", relevant_sentences_json="[1]", html_content="

proposer

", ) assert cache.get_highlight(key1) == "

budget

" assert cache.get_highlight(key2) == "

proposer

"