"""Phase 1.5.4: Enhanced metadata tests. Tests for extract_metadata() with new page_number, chunk_file_path, and document_id fields. These fields are optional and maintain backward compatibility with existing callers. """ from pathlib import Path from datetime import datetime import pytest import importlib.util # Dynamically load the metadata module (same pattern as test_phase1_metadata.py). MODULE_PATH = Path(__file__).resolve().parents[1] / "utils" / "metadata.py" spec = importlib.util.spec_from_file_location("metadata_module", str(MODULE_PATH)) assert spec is not None and spec.loader is not None metadata_module = importlib.util.module_from_spec(spec) # type: ignore spec.loader.exec_module(metadata_module) # type: ignore extract_metadata = getattr(metadata_module, "extract_metadata") def _is_iso8601(s: str) -> bool: try: datetime.fromisoformat(s) return True except ValueError: return False # ── Test cases ────────────────────────────────────────────────────────── def test_extract_metadata_with_page_numbers(tmp_path): """Each chunk should receive the corresponding page_number.""" dummy_file = tmp_path / "doc.pdf" dummy_file.write_text("pdf content") chunks = ["chunk A", "chunk B", "chunk C"] page_numbers = [1, 2, 3] metadata = extract_metadata( str(dummy_file), chunks, page_numbers=page_numbers ) assert len(metadata) == 3 for i, m in enumerate(metadata): assert m["page_number"] == page_numbers[i] # Existing fields must still be present assert "filename" in m assert "chunk_index" in m def test_extract_metadata_with_chunk_file_paths(tmp_path): """Each chunk should receive the corresponding chunk_file_path.""" dummy_file = tmp_path / "report.pdf" dummy_file.write_text("report data") chunks = ["page-1 text", "page-2 text"] chunk_file_paths = ["report_page_1.pdf", "report_page_2.pdf"] metadata = extract_metadata( str(dummy_file), chunks, chunk_file_paths=chunk_file_paths ) assert len(metadata) == 2 for i, m in enumerate(metadata): assert m["chunk_file_path"] == chunk_file_paths[i] def test_extract_metadata_with_document_id(tmp_path): """All metadata dicts should contain the same document_id.""" dummy_file = tmp_path / "memo.docx" dummy_file.write_text("memo") chunks = ["section A", "section B"] document_id = "test-uuid-123" metadata = extract_metadata( str(dummy_file), chunks, document_id=document_id ) assert len(metadata) == 2 for m in metadata: assert m["document_id"] == "test-uuid-123" def test_extract_metadata_all_new_fields(tmp_path): """Pass all three new params together and verify complete structure.""" dummy_file = tmp_path / "full.pdf" dummy_file.write_text("full doc") chunks = ["alpha", "beta", "gamma"] page_numbers = [5, 6, 7] chunk_file_paths = ["full_p5.txt", "full_p6.txt", "full_p7.txt"] document_id = "uuid-all-fields" metadata = extract_metadata( str(dummy_file), chunks, page_numbers=page_numbers, chunk_file_paths=chunk_file_paths, document_id=document_id, ) assert len(metadata) == 3 for i, m in enumerate(metadata): # New fields assert m["page_number"] == page_numbers[i] assert m["chunk_file_path"] == chunk_file_paths[i] assert m["document_id"] == document_id # Old fields must still be present and correct assert m["filename"] == "full.pdf" assert m["chunk_index"] == i assert _is_iso8601(m["upload_date"]) assert m["content_summary"] == chunks[i] def test_extract_metadata_without_new_fields(tmp_path): """Calling without any new params must be backward-compatible. The new fields (page_number, chunk_file_path, document_id) should not appear in the metadata dicts when not explicitly requested. """ dummy_file = tmp_path / "legacy.txt" dummy_file.write_text("legacy") chunks = ["old-style chunk"] metadata = extract_metadata(str(dummy_file), chunks) assert len(metadata) == 1 m = metadata[0] # New fields should be absent (or None — implementation choice). # Following the plan: these fields are optional additions. assert m.get("page_number") is None assert m.get("chunk_file_path") is None assert m.get("document_id") is None # Core fields intact assert m["filename"] == "legacy.txt" assert m["chunk_index"] == 0 def test_extract_metadata_page_numbers_length_mismatch(tmp_path): """If page_numbers length != chunks length, should raise ValueError.""" dummy_file = tmp_path / "mismatch.pdf" dummy_file.write_text("data") chunks = ["a", "b", "c"] page_numbers = [1, 2] # Only 2 for 3 chunks with pytest.raises(ValueError, match="page_numbers"): extract_metadata(str(dummy_file), chunks, page_numbers=page_numbers) def test_extract_metadata_chunk_file_paths_length_mismatch(tmp_path): """If chunk_file_paths length != chunks length, should raise ValueError.""" dummy_file = tmp_path / "mismatch2.pdf" dummy_file.write_text("data") chunks = ["x", "y"] chunk_file_paths = ["only_one.pdf"] # 1 for 2 chunks with pytest.raises(ValueError, match="chunk_file_paths"): extract_metadata( str(dummy_file), chunks, chunk_file_paths=chunk_file_paths ) def test_extract_metadata_page_numbers_none_in_list(tmp_path): """page_numbers can contain None for chunks without page info (e.g. DOCX).""" dummy_file = tmp_path / "mixed.docx" dummy_file.write_text("docx content") chunks = ["cover page", "body text"] page_numbers = [None, 1] metadata = extract_metadata( str(dummy_file), chunks, page_numbers=page_numbers ) assert len(metadata) == 2 assert "page_number" not in metadata[0] assert metadata[1]["page_number"] == 1