"""Phase 8 tests: Q&A metadata enrichment (Sub-Phase 8.2). Covers: - Metadata enrichment with Q&A-specific fields via chunk_metadata param - Backward compatibility: token strategy unchanged - Page number references question location - Chunk metadata merging with base metadata """ import json import pytest from app.utils.metadata import extract_metadata def test_qa_metadata_fields(tmp_path): """strategy_type, question_index, question_id, question_text merged via chunk_metadata.""" file_path = tmp_path / "test.pdf" file_path.write_text("dummy content") chunks = ["chunk 1", "chunk 2"] chunk_metadata = [ { "strategy_type": "question", "section_type": "qa", "question_index": 0, "question_id": "A1", "question_text": "What is X?", "section_heading": "(A) Section", "answer_contains_table": True, "source_page_range": [2, 5], "parent_topic": "Topic Name", }, { "strategy_type": "question", "section_type": "qa", "question_index": 1, "question_id": "A2", "question_text": "What is Y?", "section_heading": "(A) Section", "answer_contains_table": False, "source_page_range": [5, 7], }, ] metadata = extract_metadata( file_path=str(file_path), chunks=chunks, strategy_type="question", chunk_metadata=chunk_metadata, ) assert len(metadata) == 2 m0 = metadata[0] assert m0["strategy_type"] == "question" assert m0["section_type"] == "qa" assert m0["question_index"] == 0 assert m0["question_id"] == "A1" assert m0["question_text"] == "What is X?" assert m0["section_heading"] == "(A) Section" assert m0["answer_contains_table"] is True assert m0["source_page_range"] == [2, 5] assert m0["parent_topic"] == "Topic Name" m1 = metadata[1] assert m1["question_index"] == 1 assert m1["question_id"] == "A2" assert m1["answer_contains_table"] is False def test_qa_metadata_topic_section(tmp_path): """section_heading and parent_topic are both preserved.""" file_path = tmp_path / "test.pdf" file_path.write_text("dummy content") metadata = extract_metadata( file_path=str(file_path), chunks=["chunk"], strategy_type="question", chunk_metadata=[{"section_heading": "(B) Traffic", "parent_topic": "Traffic Planning"}], ) assert metadata[0]["section_heading"] == "(B) Traffic" assert metadata[0]["parent_topic"] == "Traffic Planning" def test_token_metadata_unchanged(tmp_path): """Existing metadata fields unchanged for token strategy (no chunk_metadata).""" file_path = tmp_path / "test.txt" file_path.write_text("test content") metadata = extract_metadata( file_path=str(file_path), chunks=["chunk 1", "chunk 2"], original_filename="original.txt", strategy_type="token", ) assert len(metadata) == 2 for m in metadata: assert "filename" in m assert "upload_date" in m assert "content_summary" in m assert "chunk_index" in m assert m.get("strategy_type", "token") == "token" assert "question_id" not in m def test_page_number_from_question(tmp_path): """Page ref should point to question location (pass via page_numbers from strategy).""" file_path = tmp_path / "test.pdf" file_path.write_text("dummy content") metadata = extract_metadata( file_path=str(file_path), chunks=["question chunk"], page_numbers=[3], strategy_type="question", chunk_metadata=[{ "question_id": "A1", "source_page_range": [3, 8], }], ) assert metadata[0]["page_number"] == 3 assert metadata[0]["source_page_range"] == [3, 8] def test_chunk_metadata_length_mismatch(tmp_path): """chunk_metadata length mismatch with chunks raises ValueError.""" file_path = tmp_path / "test.pdf" file_path.write_text("dummy content") with pytest.raises(ValueError, match="chunk_metadata length"): extract_metadata( file_path=str(file_path), chunks=["a", "b", "c"], chunk_metadata=[{}, {}], ) def test_chunk_metadata_empty_no_error(tmp_path): """Empty chunk_metadata list with matching chunks is valid.""" file_path = tmp_path / "test.pdf" file_path.write_text("dummy content") metadata = extract_metadata( file_path=str(file_path), chunks=["a"], chunk_metadata=[], ) assert len(metadata) == 1