482 lines
18 KiB
Python
482 lines
18 KiB
Python
"""Phase 8 tests: Q&A-pair chunking strategy (Sub-Phase 8.1).
|
||
|
||
Covers:
|
||
- LLM structure detection response parsing (parse_llm_structure_response)
|
||
- Mixed format handling (問/答 + section headings)
|
||
- Narrative-only text (no Q&A format)
|
||
- Speaking notes (發言要點) chunking by bullet
|
||
- Regex fast-pass for Chinese 問/答 format
|
||
- Regex fast-pass for English Q1/Q2 format
|
||
- Multi-page section tracking with [PAGE_BREAK] markers
|
||
- ChunkingStrategy ABC compliance
|
||
- Page number references question (問) page, not answer
|
||
- Size limit: oversized sections recursively split with heading preserved
|
||
- build_chunks_from_sections output verification
|
||
- preprocess_text: footer stripping, colon normalization, page break insertion
|
||
"""
|
||
|
||
import json
|
||
from typing import List, Tuple
|
||
from unittest.mock import AsyncMock, MagicMock
|
||
|
||
import pytest
|
||
|
||
from app.utils.qa_chunking import (
|
||
Section,
|
||
preprocess_text,
|
||
build_structure_detection_prompt,
|
||
parse_llm_structure_response,
|
||
split_chinese_qa,
|
||
split_english_qa,
|
||
build_chunks_from_sections,
|
||
)
|
||
from app.utils.chunking import (
|
||
ChunkingStrategy,
|
||
QuestionChunkingStrategy,
|
||
get_chunking_strategy,
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Fixtures
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@pytest.fixture
def mock_settings():
    """Return a MagicMock standing in for Settings with Q&A chunking defaults.

    Only the attributes listed in ``defaults`` are pinned to concrete values;
    every other attribute keeps MagicMock's auto-generated behaviour.
    """
    defaults = {
        # Chunking configuration
        "default_chunking_strategy": "question",
        "qa_vision_enabled": False,
        "qa_max_chunk_tokens": 3000,
        "qa_structure_model": "",
        "qa_include_internal_refs": True,
        "qa_cache_vision_results": True,
        "chunk_size": 1000,
        "chunk_overlap": 200,
        # LLM client configuration
        "llm_model_name": "test-model",
        "llm_api_key": "test-key",
        "llm_base_url": "https://example.com/v1",
        "llm_timeout": 30.0,
        "llm_enable_thinking": False,
        "vllm_engine": False,
    }
    settings = MagicMock()
    settings.configure_mock(**defaults)
    return settings
|
||
|
||
|
||
# Canned structure-detection reply shared by several tests: one Q&A section,
# one narrative section and one speaking-notes section, serialized to JSON
# exactly as the parser receives it.
SAMPLE_LLM_RESPONSE = json.dumps(
    dict(
        sections=[
            dict(
                type="qa",
                heading="(A) 排水系統",
                qa_id="A1",
                question="古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?",
                answer="研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。",
                start_page=2,
                end_page=3,
                has_table=False,
                parent_topic="排水系統",
            ),
            dict(
                type="narrative",
                heading="(1) 住戶的安置補償",
                content="合資格住戶可選擇(i)「須通過經濟狀況審查」安置安排。",
                start_page=2,
                end_page=5,
                has_table=False,
            ),
            dict(
                type="speaking_notes",
                heading="發言要點",
                content="⚫ 古洞北/粉嶺北新發展區是北部都會區內的新市鎮發展項目\n⚫ 第一期發展預計於2030年完成",
                start_page=1,
                end_page=2,
                has_table=False,
            ),
        ]
    )
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test: LLM structure detection parsing
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestLLMStructureDetection:
    """Parsing of the JSON reply produced by the structure-detection LLM."""

    def test_llm_structure_detection(self):
        """The shared sample reply yields qa, narrative and speaking_notes sections."""
        parsed = parse_llm_structure_response(SAMPLE_LLM_RESPONSE)
        assert len(parsed) == 3
        qa_section, narrative_section, notes_section = parsed

        # Q&A section: every field is carried over from the JSON payload.
        assert qa_section.type == "qa"
        assert qa_section.qa_id == "A1"
        assert qa_section.question == "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?"
        assert qa_section.answer == "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。"
        assert qa_section.start_page == 2
        assert qa_section.end_page == 3
        assert qa_section.heading == "(A) 排水系統"
        assert qa_section.parent_topic == "排水系統"

        # Narrative section keeps its heading and body text.
        assert narrative_section.type == "narrative"
        assert narrative_section.heading == "(1) 住戶的安置補償"
        assert "合資格住戶" in narrative_section.content

        # Speaking notes keep their bullet glyphs.
        assert notes_section.type == "speaking_notes"
        assert "⚫" in notes_section.content

    def test_llm_handles_mixed_formats(self):
        """A reply mixing a 問/答 pair with a headed narrative keeps both types in order."""
        qa_entry = {
            "type": "qa",
            "heading": "(B) 交通",
            "qa_id": "B1",
            "question": "新建道路何時通車?",
            "answer": "預計2027年通車。",
            "start_page": 3,
            "end_page": 4,
            "has_table": False,
        }
        background_entry = {
            "type": "narrative",
            "heading": "背景",
            "content": "本文件說明交通規劃。",
            "start_page": 1,
            "end_page": 2,
            "has_table": False,
        }
        payload = json.dumps({"sections": [qa_entry, background_entry]})

        parsed = parse_llm_structure_response(payload)

        assert len(parsed) == 2
        first, second = parsed
        assert first.type == "qa"
        assert second.type == "narrative"

    def test_llm_handles_no_qa_format(self):
        """Narrative-only text (like File L pages 1-13) produces only narrative sections."""
        intro_entry = {
            "type": "narrative",
            "heading": "Introduction",
            "content": "This document provides background on policy matters.",
            "start_page": 1,
            "end_page": 5,
            "has_table": False,
        }
        analysis_entry = {
            "type": "narrative",
            "heading": "Analysis",
            "content": "The analysis covers multiple dimensions.",
            "start_page": 5,
            "end_page": 13,
            "has_table": False,
        }
        payload = json.dumps({"sections": [intro_entry, analysis_entry]})

        parsed = parse_llm_structure_response(payload)

        assert len(parsed) == 2
        for section in parsed:
            assert section.type == "narrative"

    def test_llm_handles_speaking_notes(self):
        """A 發言要點 entry with three bullets parses into one speaking_notes section."""
        notes_entry = {
            "type": "speaking_notes",
            "heading": "發言要點",
            "content": "⚫ 要點一:政策方向\n⚫ 要點二:實施計劃\n⚫ 要點三:預算安排",
            "start_page": 1,
            "end_page": 2,
            "has_table": False,
        }
        payload = json.dumps({"sections": [notes_entry]})

        parsed = parse_llm_structure_response(payload)

        assert len(parsed) == 1
        only_section = parsed[0]
        assert only_section.type == "speaking_notes"
        assert only_section.content.count("⚫") == 3

    def test_parse_markdown_fenced_json(self):
        """A reply wrapped in a ```json ... ``` fence parses like the bare JSON."""
        wrapped = f"```json\n{SAMPLE_LLM_RESPONSE}\n```"
        assert len(parse_llm_structure_response(wrapped)) == 3

    def test_parse_invalid_json_raises(self):
        """Non-JSON input raises ValueError whose message matches "Invalid JSON"."""
        not_json = "this is not json"
        with pytest.raises(ValueError, match="Invalid JSON"):
            parse_llm_structure_response(not_json)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test: Regex fast-pass
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestRegexFastPass:
    """Regex-based splitters that detect Q&A markers without calling the LLM."""

    def test_regex_fastpass_chinese(self):
        """問/答 markers are picked up by split_chinese_qa and yield qa sections."""
        lines = [
            "(A) 排水系統",
            "問 B1:古洞北的設計是否能抵禦氣候變化?",
            "答 B1:研究顧問已為古洞北新發展區進行了評估。",
            "問 B2:第二個問題是什麼?",
            "答 B2:這是第二個問題的答案。",
        ]
        document = "\n".join(lines) + "\n"

        found = split_chinese_qa(document)

        assert len(found) >= 2
        # Every detected section must be a Q&A pair.
        for section in found:
            assert section.type == "qa"
        # The first question is the one mentioning 古洞北.
        assert "古洞北" in found[0].question

    def test_regex_fastpass_chinese_no_match(self):
        """split_chinese_qa returns an empty list when the text has no markers."""
        plain = "This is plain text without any Q&A markers."
        result = split_chinese_qa(plain)
        assert result == []

    def test_regex_fastpass_english(self):
        """Q1/Q2 markers are picked up by split_english_qa and yield qa sections."""
        lines = [
            "Background information here.",
            "",
            "Q1 What is the timeline for the project?",
            "The project is expected to complete by 2027.",
            "Q2 How much will it cost?",
            "The estimated cost is HK$500 million.",
        ]
        document = "\n".join(lines) + "\n"

        found = split_english_qa(document)

        assert len(found) >= 2
        for section in found:
            assert section.type == "qa"
        # A missing question is treated as an empty string for the search.
        lowered_questions = [(section.question or "").lower() for section in found]
        assert any("timeline" in question for question in lowered_questions)

    def test_regex_fastpass_english_no_match(self):
        """split_english_qa returns an empty list when the text has no markers."""
        chinese_only = "純中文文本沒有英文問答標記。"
        result = split_english_qa(chinese_only)
        assert result == []
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test: Multi-page tracking
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestMultiPage:
    """Page-break markers emitted by preprocess_text for multi-page input."""

    def test_multi_page_sections(self):
        """Each input page contributes its own [PAGE_BREAK: N] marker."""
        page_one = (1, "Header line\n(A) Water drainage\nSome intro text")
        page_two = (2, "More drainage info\nFooter text X-1")
        page_three = (3, "New section begins\n(B) Traffic planning")

        combined = preprocess_text([page_one, page_two, page_three])

        # One marker per page number that was fed in.
        for page_no in (1, 2, 3):
            assert f"[PAGE_BREAK: {page_no}]" in combined
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test: ABC contract
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestABCContract:
    """QuestionChunkingStrategy and the strategy factory honour the ABC."""

    def test_abc_contract(self):
        """A QuestionChunkingStrategy instance is a ChunkingStrategy."""
        # Deliberately a bare mock (not the module fixture) with just two
        # attributes pinned.
        bare_settings = MagicMock()
        bare_settings.qa_max_chunk_tokens = 3000
        bare_settings.qa_include_internal_refs = True

        instance = QuestionChunkingStrategy(settings=bare_settings)

        assert isinstance(instance, ChunkingStrategy)

    def test_get_chunking_strategy_factory(self, mock_settings):
        """The factory maps "token" and "question" to the expected strategy types."""
        from_token = get_chunking_strategy("token", mock_settings)
        assert isinstance(from_token, ChunkingStrategy)

        from_question = get_chunking_strategy("question", mock_settings)
        assert isinstance(from_question, QuestionChunkingStrategy)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test: Page number reference
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestPageNumberReference:
    """The page reported for a Q&A chunk is where the question starts."""

    def test_page_number_reference_question(self):
        """Chunk page equals start_page; the full span is kept in source_page_range."""
        qa_pair = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question="What is X?",
            answer="X is Y.",
            start_page=5,
            end_page=8,
        )

        produced = build_chunks_from_sections([qa_pair])

        assert len(produced) == 1
        _text, reported_page, meta = produced[0]
        # start_page (question location) wins over end_page (answer location).
        assert reported_page == 5
        assert meta.get("source_page_range") == [5, 8]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test: Size limit recursive split
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestSizeLimit:
    """Splitting of Q&A sections that exceed the token budget."""

    def test_size_limit(self):
        """A QA section over max_tokens is split; every piece repeats the question."""
        # 80 paragraphs of 200 filler characters each make a very long answer.
        paragraphs = [f"Paragraph {i}: " + "x" * 200 for i in range(80)]
        oversized_answer = "\n\n".join(paragraphs)
        big_section = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question="What is the detailed plan?",
            answer=oversized_answer,
            start_page=2,
            end_page=5,
            has_table=False,
        )

        # A small max_tokens forces the split.
        pieces = build_chunks_from_sections([big_section], max_tokens=500)

        assert len(pieces) > 1
        for piece_text, piece_page, _piece_meta in pieces:
            # The question text is prepended to each piece.
            assert "What is the detailed plan?" in piece_text
            # The page stays pinned to the question's page.
            assert piece_page == 2
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test: build_chunks_from_sections
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestBuildChunksFromSections:
    """End-to-end check of build_chunks_from_sections on a mixed section list."""

    def test_build_chunks_from_sections(self):
        """Chunk text and metadata are checked for qa, narrative, notes and toc input."""
        qa_section = Section(
            type="qa",
            heading="(A) 排水系統",
            qa_id="A1",
            question="古洞北的設計是否能抵禦氣候變化?",
            answer="研究顧問已為古洞北進行了評估。",
            start_page=2,
            end_page=3,
            has_table=True,
            parent_topic="排水系統",
        )
        narrative_section = Section(
            type="narrative",
            heading="(1) 住戶的安置補償",
            content="合資格住戶可選擇安置安排。",
            start_page=3,
            end_page=5,
            has_table=False,
        )
        notes_section = Section(
            type="speaking_notes",
            heading="發言要點",
            content="⚫ 要點一:政策方向\n⚫ 要點二:實施計劃",
            start_page=1,
            end_page=1,
            has_table=False,
        )
        toc_section = Section(
            type="toc",
            heading="目錄",
            content="Page 1 ... Page 2",
            start_page=1,
            end_page=1,
            has_table=False,
        )

        produced = build_chunks_from_sections(
            [qa_section, narrative_section, notes_section, toc_section]
        )

        def of_type(kind):
            """Return the (text, page, metadata) chunks whose section_type is *kind*."""
            return [
                (text, page, meta)
                for text, page, meta in produced
                if meta.get("section_type") == kind
            ]

        # Expected: 1 QA + 1 narrative + 2 speaking_notes bullets + 0 toc = 4
        assert len(produced) >= 4

        # The QA chunk comes first and carries the full metadata set.
        first_text, first_page, first_meta = produced[0]
        assert "古洞北" in first_text
        assert first_page == 2
        assert first_meta["section_type"] == "qa"
        assert first_meta["question_id"] == "A1"
        assert first_meta["question_index"] == 0
        assert first_meta["answer_contains_table"] is True
        assert first_meta["section_heading"] == "(A) 排水系統"

        # Exactly one narrative chunk, containing both heading and body.
        narrative_chunks = of_type("narrative")
        assert len(narrative_chunks) == 1
        narrative_text, _narrative_page, _narrative_meta = narrative_chunks[0]
        assert "住戶的安置補償" in narrative_text
        assert "合資格住戶" in narrative_text

        # One chunk per speaking-notes bullet.
        bullet_chunks = of_type("speaking_notes")
        assert len(bullet_chunks) == 2
        for bullet_text, _bullet_page, _bullet_meta in bullet_chunks:
            assert "要點" in bullet_text

        # The table of contents produces no chunks.
        assert len(of_type("toc")) == 0
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test: preprocess_text
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestPreprocessText:
    """Checks on preprocess_text: page markers, colon normalization, footers."""

    def test_preprocess_text(self):
        """Footer markers stripped, colons normalized, page breaks inserted."""
        # The fullwidth colon (U+FF1A) is written as an escape so it cannot be
        # silently flattened to ASCII ":" by an editor or encoding round-trip.
        # With plain ASCII colons in both the input and the negative assertion,
        # the two colon assertions below contradict each other.
        pages = [
            (1, "Header\n(A) Section Title\nX-1\n2024-01-15"),
            (2, "Content with\uff1afullwidth colon\nMore text\uff1ahere"),
        ]
        result = preprocess_text(pages)

        # Should have page break markers
        assert "[PAGE_BREAK: 1]" in result
        assert "[PAGE_BREAK: 2]" in result

        # Fullwidth colons normalized to ASCII
        assert "\uff1a" not in result
        assert ":" in result

        # Page footer patterns should be stripped (X-1, dates like 2024-01-15)
        assert "X-1" not in result
        assert "2024-01-15" not in result
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test: build_structure_detection_prompt
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestBuildPrompt:
    """Content checks on the prompt built for LLM structure detection."""

    def test_build_structure_detection_prompt(self):
        """The prompt names the domain, the section types, and embeds the document."""
        document = "Sample document text [PAGE_BREAK: 1]"

        built = build_structure_detection_prompt(document)
        lowered = built.lower()

        # Domain context: either phrasing is accepted.
        accepted_phrases = ("Hong Kong Legislative Council", "Legislative Council")
        assert any(phrase in built for phrase in accepted_phrases)
        # Each section type must be mentioned in some form.
        assert "qa" in lowered or "問" in built
        assert "narrative" in lowered
        assert "speaking_notes" in lowered or "speaking notes" in lowered
        # The document text is embedded verbatim.
        assert document in built
|