150 lines
4.6 KiB
Python
150 lines
4.6 KiB
Python
"""Phase 8 tests: Q&A metadata enrichment (Sub-Phase 8.2).

Covers:
- Metadata enrichment with Q&A-specific fields via chunk_metadata param
- Backward compatibility: token strategy unchanged
- Page number references question location
- Chunk metadata merging with base metadata
"""
|
|
import json
|
|
|
|
import pytest
|
|
|
|
from app.utils.metadata import extract_metadata
|
|
|
|
|
|
def test_qa_metadata_fields(tmp_path):
    """Q&A fields supplied through ``chunk_metadata`` show up in the returned metadata.

    Two chunks are passed together with two per-chunk dicts; the test checks that
    one entry per chunk comes back and that each entry exposes the values given
    for that chunk.
    """
    pdf = tmp_path / "test.pdf"
    pdf.write_text("dummy content")

    first_question = {
        "strategy_type": "question",
        "section_type": "qa",
        "question_index": 0,
        "question_id": "A1",
        "question_text": "What is X?",
        "section_heading": "(A) Section",
        "answer_contains_table": True,
        "source_page_range": [2, 5],
        "parent_topic": "Topic Name",
    }
    # The second question deliberately carries no "parent_topic".
    second_question = {
        "strategy_type": "question",
        "section_type": "qa",
        "question_index": 1,
        "question_id": "A2",
        "question_text": "What is Y?",
        "section_heading": "(A) Section",
        "answer_contains_table": False,
        "source_page_range": [5, 7],
    }

    enriched = extract_metadata(
        file_path=str(pdf),
        chunks=["chunk 1", "chunk 2"],
        strategy_type="question",
        chunk_metadata=[first_question, second_question],
    )
    assert len(enriched) == 2

    # Expected values are spelled out as literals (not read back from the input
    # dicts) so the check is independent of what the callee does with its inputs.
    first = enriched[0]
    expected_first = (
        ("strategy_type", "question"),
        ("section_type", "qa"),
        ("question_index", 0),
        ("question_id", "A1"),
        ("question_text", "What is X?"),
        ("section_heading", "(A) Section"),
    )
    for key, wanted in expected_first:
        assert first[key] == wanted
    # Identity check: the flag must be the bool itself, not merely truthy.
    assert first["answer_contains_table"] is True
    assert first["source_page_range"] == [2, 5]
    assert first["parent_topic"] == "Topic Name"

    second = enriched[1]
    assert second["question_index"] == 1
    assert second["question_id"] == "A2"
    assert second["answer_contains_table"] is False
|
|
|
|
|
|
def test_qa_metadata_topic_section(tmp_path):
    """Both ``section_heading`` and ``parent_topic`` come back for a single chunk."""
    pdf = tmp_path / "test.pdf"
    pdf.write_text("dummy content")

    topic_fields = {"section_heading": "(B) Traffic", "parent_topic": "Traffic Planning"}
    result = extract_metadata(
        file_path=str(pdf),
        chunks=["chunk"],
        strategy_type="question",
        chunk_metadata=[topic_fields],
    )

    entry = result[0]
    assert entry["section_heading"] == "(B) Traffic"
    assert entry["parent_topic"] == "Traffic Planning"
|
|
|
|
|
|
def test_token_metadata_unchanged(tmp_path):
    """Token strategy without ``chunk_metadata`` keeps the base fields and adds no Q&A ones.

    Each returned entry must contain the base keys, must report "token" as its
    strategy (or omit the key entirely), and must not contain ``question_id``.
    """
    txt = tmp_path / "test.txt"
    txt.write_text("test content")

    entries = extract_metadata(
        file_path=str(txt),
        chunks=["chunk 1", "chunk 2"],
        original_filename="original.txt",
        strategy_type="token",
    )
    assert len(entries) == 2

    base_keys = ("filename", "upload_date", "content_summary", "chunk_index")
    for entry in entries:
        for key in base_keys:
            assert key in entry
        # A missing "strategy_type" is tolerated and treated as "token".
        assert entry.get("strategy_type", "token") == "token"
        assert "question_id" not in entry
|
|
|
|
|
|
def test_page_number_from_question(tmp_path):
    """``page_number`` reflects the ``page_numbers`` argument, alongside the page range.

    The chunk is given page 3 via ``page_numbers`` and a ``source_page_range`` of
    [3, 8] via ``chunk_metadata``; both values must be present in the result.
    """
    pdf = tmp_path / "test.pdf"
    pdf.write_text("dummy content")

    question_fields = {
        "question_id": "A1",
        "source_page_range": [3, 8],
    }
    result = extract_metadata(
        file_path=str(pdf),
        chunks=["question chunk"],
        page_numbers=[3],
        strategy_type="question",
        chunk_metadata=[question_fields],
    )

    entry = result[0]
    assert entry["page_number"] == 3
    assert entry["source_page_range"] == [3, 8]
|
|
|
|
|
|
def test_chunk_metadata_length_mismatch(tmp_path):
    """Three chunks with only two ``chunk_metadata`` entries must raise ``ValueError``."""
    pdf = tmp_path / "test.pdf"
    pdf.write_text("dummy content")

    mismatched_call = {
        "file_path": str(pdf),
        "chunks": ["a", "b", "c"],
        "chunk_metadata": [{}, {}],
    }
    # The error message is expected to mention "chunk_metadata length".
    with pytest.raises(ValueError, match="chunk_metadata length"):
        extract_metadata(**mismatched_call)
|
|
|
|
|
|
def test_chunk_metadata_empty_no_error(tmp_path):
    """An empty ``chunk_metadata`` list is accepted and one entry per chunk is returned.

    Note that the call passes one chunk and zero metadata dicts, so the lengths
    differ; the test expects no error in that case and a single result entry.
    """
    pdf = tmp_path / "test.pdf"
    pdf.write_text("dummy content")

    result = extract_metadata(file_path=str(pdf), chunks=["a"], chunk_metadata=[])

    assert len(result) == 1
|