legco_ai_assistant/backend/app/test/test_phase8_metadata.py

150 lines
4.6 KiB
Python

"""Phase 8 tests: Q&A metadata enrichment (Sub-Phase 8.2).
Covers:
- Metadata enrichment with Q&A-specific fields via chunk_metadata param
- Backward compatibility: token strategy unchanged
- Page number references question location
- Chunk metadata merging with base metadata
"""
import json
import pytest
from app.utils.metadata import extract_metadata
def test_qa_metadata_fields(tmp_path):
    """Verify that Q&A fields passed through chunk_metadata reach the output.

    Two chunks are submitted together with one metadata dict per chunk.
    Every field supplied for the first chunk is checked on the first
    result; question_index, question_id and answer_contains_table are
    checked on the second.
    """
    pdf_path = tmp_path / "test.pdf"
    pdf_path.write_text("dummy content")

    first_question = {
        "strategy_type": "question",
        "section_type": "qa",
        "question_index": 0,
        "question_id": "A1",
        "question_text": "What is X?",
        "section_heading": "(A) Section",
        "answer_contains_table": True,
        "source_page_range": [2, 5],
        "parent_topic": "Topic Name",
    }
    second_question = {
        "strategy_type": "question",
        "section_type": "qa",
        "question_index": 1,
        "question_id": "A2",
        "question_text": "What is Y?",
        "section_heading": "(A) Section",
        "answer_contains_table": False,
        "source_page_range": [5, 7],
    }

    metadata = extract_metadata(
        file_path=str(pdf_path),
        chunks=["chunk 1", "chunk 2"],
        strategy_type="question",
        chunk_metadata=[first_question, second_question],
    )

    assert len(metadata) == 2

    # Expected values are spelled out as fresh literals (not read back from
    # the input dicts) so the check stays meaningful even if the function
    # under test were to mutate what it was given.
    first = metadata[0]
    expected_first = (
        ("strategy_type", "question"),
        ("section_type", "qa"),
        ("question_index", 0),
        ("question_id", "A1"),
        ("question_text", "What is X?"),
        ("section_heading", "(A) Section"),
        ("source_page_range", [2, 5]),
        ("parent_topic", "Topic Name"),
    )
    for key, value in expected_first:
        assert first[key] == value
    # Identity check: the flag must be the bool True, not merely truthy.
    assert first["answer_contains_table"] is True

    second = metadata[1]
    assert second["question_index"] == 1
    assert second["question_id"] == "A2"
    assert second["answer_contains_table"] is False
def test_qa_metadata_topic_section(tmp_path):
    """Verify section_heading and parent_topic both survive for one chunk."""
    pdf_path = tmp_path / "test.pdf"
    pdf_path.write_text("dummy content")

    topic_fields = {
        "section_heading": "(B) Traffic",
        "parent_topic": "Traffic Planning",
    }
    result = extract_metadata(
        file_path=str(pdf_path),
        chunks=["chunk"],
        strategy_type="question",
        chunk_metadata=[topic_fields],
    )

    only_entry = result[0]
    assert only_entry["section_heading"] == "(B) Traffic"
    assert only_entry["parent_topic"] == "Traffic Planning"
def test_token_metadata_unchanged(tmp_path):
    """Verify the token strategy output when no chunk_metadata is supplied.

    Each result must carry the base keys (filename, upload_date,
    content_summary, chunk_index), must report "token" as its strategy
    type when that key is present, and must not contain question_id.
    """
    txt_path = tmp_path / "test.txt"
    txt_path.write_text("test content")

    results = extract_metadata(
        file_path=str(txt_path),
        chunks=["chunk 1", "chunk 2"],
        original_filename="original.txt",
        strategy_type="token",
    )

    assert len(results) == 2

    base_keys = ("filename", "upload_date", "content_summary", "chunk_index")
    for entry in results:
        absent = [key for key in base_keys if key not in entry]
        assert not absent
        # A missing strategy_type key is tolerated and treated as "token".
        assert entry.get("strategy_type", "token") == "token"
        assert "question_id" not in entry
def test_page_number_from_question(tmp_path):
    """Verify page_number comes from page_numbers alongside source_page_range.

    A single chunk is submitted with page_numbers=[3] and a chunk_metadata
    entry whose source_page_range is [3, 8]; both values are expected on
    the resulting metadata entry.
    """
    pdf_path = tmp_path / "test.pdf"
    pdf_path.write_text("dummy content")

    question_fields = {
        "question_id": "A1",
        "source_page_range": [3, 8],
    }
    result = extract_metadata(
        file_path=str(pdf_path),
        chunks=["question chunk"],
        page_numbers=[3],
        strategy_type="question",
        chunk_metadata=[question_fields],
    )

    entry = result[0]
    assert entry["page_number"] == 3
    assert entry["source_page_range"] == [3, 8]
def test_chunk_metadata_length_mismatch(tmp_path):
    """Verify three chunks with two chunk_metadata entries raise ValueError.

    The error message is expected to match "chunk_metadata length".
    """
    pdf_path = tmp_path / "test.pdf"
    pdf_path.write_text("dummy content")

    three_chunks = ["a", "b", "c"]
    two_entries = [{}, {}]

    with pytest.raises(ValueError, match="chunk_metadata length"):
        extract_metadata(
            file_path=str(pdf_path),
            chunks=three_chunks,
            chunk_metadata=two_entries,
        )
def test_chunk_metadata_empty_no_error(tmp_path):
    """Verify an empty chunk_metadata list is accepted for a single chunk.

    One chunk is passed together with chunk_metadata=[]; the call is
    expected to complete and return exactly one metadata entry.
    """
    pdf_path = tmp_path / "test.pdf"
    pdf_path.write_text("dummy content")

    result = extract_metadata(
        file_path=str(pdf_path),
        chunks=["a"],
        chunk_metadata=[],
    )

    assert len(result) == 1