legco_ai_assistant/backend/app/test/test_phase8_qa_chunking.py

482 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Phase 8 tests: Q&A-pair chunking strategy (Sub-Phase 8.1).
Covers:
- LLM structure detection response parsing (parse_llm_structure_response)
- Mixed format handling (問/答 + section headings)
- Narrative-only text (no Q&A format)
- Speaking notes (發言要點) chunking by bullet
- Regex fast-pass for Chinese 問/答 format
- Regex fast-pass for English Q1/Q2 format
- Multi-page section tracking with [PAGE_BREAK] markers
- ChunkingStrategy ABC compliance
- Page number references question (問) page, not answer
- Size limit: oversized sections recursively split with heading preserved
- build_chunks_from_sections output verification
- preprocess_text: footer stripping, colon normalization, page break insertion
"""
import json
from typing import List, Tuple
from unittest.mock import AsyncMock, MagicMock
import pytest
from app.utils.qa_chunking import (
Section,
preprocess_text,
build_structure_detection_prompt,
parse_llm_structure_response,
split_chinese_qa,
split_english_qa,
build_chunks_from_sections,
)
from app.utils.chunking import (
ChunkingStrategy,
QuestionChunkingStrategy,
get_chunking_strategy,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def mock_settings():
    """Return a ``MagicMock`` posing as Settings, preloaded with Q&A chunking defaults."""
    # Attribute table applied in one go; every entry becomes a plain attribute
    # on the mock, exactly as if it had been assigned individually.
    defaults = {
        "default_chunking_strategy": "question",
        "qa_vision_enabled": False,
        "qa_max_chunk_tokens": 3000,
        "qa_structure_model": "",
        "qa_include_internal_refs": True,
        "qa_cache_vision_results": True,
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "llm_model_name": "test-model",
        "llm_api_key": "test-key",
        "llm_base_url": "https://example.com/v1",
        "llm_timeout": 30.0,
        "llm_enable_thinking": False,
        "vllm_engine": False,
    }
    settings = MagicMock()
    settings.configure_mock(**defaults)
    return settings
# Canned LLM reply shared by the parsing tests. It carries one section of each
# type exercised below (qa, narrative, speaking_notes), serialized to a JSON
# string because parse_llm_structure_response is fed raw text in these tests.
_SAMPLE_QA_SECTION = {
    "type": "qa",
    "heading": "(A) 排水系統",
    "qa_id": "A1",
    "question": "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?",
    "answer": "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。",
    "start_page": 2,
    "end_page": 3,
    "has_table": False,
    "parent_topic": "排水系統",
}
_SAMPLE_NARRATIVE_SECTION = {
    "type": "narrative",
    "heading": "(1) 住戶的安置補償",
    "content": "合資格住戶可選擇(i)「須通過經濟狀況審查」安置安排。",
    "start_page": 2,
    "end_page": 5,
    "has_table": False,
}
_SAMPLE_SPEAKING_NOTES_SECTION = {
    "type": "speaking_notes",
    "heading": "發言要點",
    "content": "⚫ 古洞北/粉嶺北新發展區是北部都會區內的新市鎮發展項目\n⚫ 第一期發展預計於2030年完成",
    "start_page": 1,
    "end_page": 2,
    "has_table": False,
}
SAMPLE_LLM_RESPONSE = json.dumps(
    {
        "sections": [
            _SAMPLE_QA_SECTION,
            _SAMPLE_NARRATIVE_SECTION,
            _SAMPLE_SPEAKING_NOTES_SECTION,
        ]
    }
)
# ---------------------------------------------------------------------------
# Test: LLM structure detection parsing
# ---------------------------------------------------------------------------
class TestLLMStructureDetection:
    """Tests for parse_llm_structure_response on canned LLM JSON payloads."""

    def test_llm_structure_detection(self):
        """parse_llm_structure_response correctly parses LLM JSON with qa/narrative/speaking_notes."""
        sections = parse_llm_structure_response(SAMPLE_LLM_RESPONSE)
        assert len(sections) == 3

        qa = sections[0]
        assert qa.type == "qa"
        assert qa.qa_id == "A1"
        assert qa.question == "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?"
        assert qa.answer == "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。"
        assert qa.start_page == 2
        assert qa.end_page == 3
        assert qa.heading == "(A) 排水系統"
        assert qa.parent_topic == "排水系統"

        narr = sections[1]
        assert narr.type == "narrative"
        assert narr.heading == "(1) 住戶的安置補償"
        assert "合資格住戶" in narr.content

        notes = sections[2]
        assert notes.type == "speaking_notes"
        # The bullet glyph had been lost here, leaving `"" in ...`, which is
        # true for every string. Check for the bullet used in the payload.
        assert "⚫" in notes.content

    def test_llm_handles_mixed_formats(self):
        """Document with 問/答 markers + section headings correctly classified."""
        mixed_json = json.dumps({
            "sections": [
                {
                    "type": "qa",
                    "heading": "(B) 交通",
                    "qa_id": "B1",
                    "question": "新建道路何時通車?",
                    "answer": "預計2027年通車。",
                    "start_page": 3,
                    "end_page": 4,
                    "has_table": False,
                },
                {
                    "type": "narrative",
                    "heading": "背景",
                    "content": "本文件說明交通規劃。",
                    "start_page": 1,
                    "end_page": 2,
                    "has_table": False,
                },
            ]
        })
        sections = parse_llm_structure_response(mixed_json)
        assert len(sections) == 2
        assert sections[0].type == "qa"
        assert sections[1].type == "narrative"

    def test_llm_handles_no_qa_format(self):
        """Narrative-only text (like File L pages 1-13) produces only narrative sections."""
        narrative_json = json.dumps({
            "sections": [
                {
                    "type": "narrative",
                    "heading": "Introduction",
                    "content": "This document provides background on policy matters.",
                    "start_page": 1,
                    "end_page": 5,
                    "has_table": False,
                },
                {
                    "type": "narrative",
                    "heading": "Analysis",
                    "content": "The analysis covers multiple dimensions.",
                    "start_page": 5,
                    "end_page": 13,
                    "has_table": False,
                },
            ]
        })
        sections = parse_llm_structure_response(narrative_json)
        assert len(sections) == 2
        assert all(s.type == "narrative" for s in sections)

    def test_llm_handles_speaking_notes(self):
        """發言要點 text with bullet points produces speaking_notes sections."""
        notes_json = json.dumps({
            "sections": [
                {
                    "type": "speaking_notes",
                    "heading": "發言要點",
                    "content": "⚫ 要點一:政策方向\n⚫ 要點二:實施計劃\n⚫ 要點三:預算安排",
                    "start_page": 1,
                    "end_page": 2,
                    "has_table": False,
                },
            ]
        })
        sections = parse_llm_structure_response(notes_json)
        assert len(sections) == 1
        assert sections[0].type == "speaking_notes"
        # str.count("") returns len(s) + 1, so the previous empty-string count
        # could never equal 3. Count the three bullets in the payload instead.
        assert sections[0].content.count("⚫") == 3

    def test_parse_markdown_fenced_json(self):
        """parse_llm_structure_response handles ```json ... ``` wrapped responses."""
        fenced = '```json\n' + SAMPLE_LLM_RESPONSE + '\n```'
        sections = parse_llm_structure_response(fenced)
        assert len(sections) == 3

    def test_parse_invalid_json_raises(self):
        """parse_llm_structure_response raises ValueError on non-JSON input."""
        with pytest.raises(ValueError, match="Invalid JSON"):
            parse_llm_structure_response("this is not json")
# ---------------------------------------------------------------------------
# Test: Regex fast-pass
# ---------------------------------------------------------------------------
class TestRegexFastPass:
    """Regex fast-pass splitters that find Q&A markers without calling the LLM."""

    def test_regex_fastpass_chinese(self):
        """split_chinese_qa picks up 問/答 marker pairs and returns only QA sections."""
        document = (
            "(A) 排水系統\n"
            "問 B1古洞北的設計是否能抵禦氣候變化\n"
            "答 B1研究顧問已為古洞北新發展區進行了評估。\n"
            "問 B2第二個問題是什麼\n"
            "答 B2這是第二個問題的答案。\n"
        )
        found = split_chinese_qa(document)
        assert len(found) >= 2
        # No section may carry a type other than "qa".
        non_qa = [section for section in found if not section.type == "qa"]
        assert not non_qa
        # The leading section holds the 古洞北 question.
        first_question = found[0].question
        assert "古洞北" in first_question

    def test_regex_fastpass_chinese_no_match(self):
        """split_chinese_qa yields an empty list for text without markers."""
        plain = "This is plain text without any Q&A markers."
        result = split_chinese_qa(plain)
        assert result == []

    def test_regex_fastpass_english(self):
        """split_english_qa picks up Q1/Q2 markers and returns only QA sections."""
        document = (
            "Background information here.\n\n"
            "Q1 What is the timeline for the project?\n"
            "The project is expected to complete by 2027.\n"
            "Q2 How much will it cost?\n"
            "The estimated cost is HK$500 million.\n"
        )
        found = split_english_qa(document)
        assert len(found) >= 2
        non_qa = [section for section in found if not section.type == "qa"]
        assert not non_qa
        # A missing question is treated as empty text before the keyword search.
        lowered_questions = [(section.question or "").lower() for section in found]
        assert any("timeline" in question for question in lowered_questions)

    def test_regex_fastpass_english_no_match(self):
        """split_english_qa yields an empty list for text without markers."""
        plain = "純中文文本沒有英文問答標記。"
        result = split_english_qa(plain)
        assert result == []
# ---------------------------------------------------------------------------
# Test: Multi-page tracking
# ---------------------------------------------------------------------------
class TestMultiPage:
    """Page-break markers emitted by preprocess_text for multi-page input."""

    def test_multi_page_sections(self):
        """Every input page contributes its own [PAGE_BREAK: N] marker."""
        page_inputs = [
            (1, "Header line\n(A) Water drainage\nSome intro text"),
            (2, "More drainage info\nFooter text X-1"),
            (3, "New section begins\n(B) Traffic planning"),
        ]
        combined = preprocess_text(page_inputs)
        # One marker per page, checked in page order.
        for marker in ("[PAGE_BREAK: 1]", "[PAGE_BREAK: 2]", "[PAGE_BREAK: 3]"):
            assert marker in combined
# ---------------------------------------------------------------------------
# Test: ABC contract
# ---------------------------------------------------------------------------
class TestABCContract:
    """Strategy classes honour the ChunkingStrategy abstract base class."""

    def test_abc_contract(self):
        """A QuestionChunkingStrategy instance is a ChunkingStrategy."""
        settings = MagicMock()
        settings.configure_mock(
            qa_max_chunk_tokens=3000,
            qa_include_internal_refs=True,
        )
        assert isinstance(QuestionChunkingStrategy(settings=settings), ChunkingStrategy)

    def test_get_chunking_strategy_factory(self, mock_settings):
        """get_chunking_strategy maps each strategy name to the expected type."""
        expectations = (
            ("token", ChunkingStrategy),
            ("question", QuestionChunkingStrategy),
        )
        for strategy_name, expected_type in expectations:
            built = get_chunking_strategy(strategy_name, mock_settings)
            assert isinstance(built, expected_type)
# ---------------------------------------------------------------------------
# Test: Page number reference
# ---------------------------------------------------------------------------
class TestPageNumberReference:
    """The page reported for a QA chunk is the page where the question starts."""

    def test_page_number_reference_question(self):
        """Chunk page number equals the question (問) page, not the answer page."""
        qa_section = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question="What is X?",
            answer="X is Y.",
            start_page=5,
            end_page=8,
        )
        produced = build_chunks_from_sections([qa_section])
        assert len(produced) == 1
        _, reported_page, meta = produced[0]
        # start_page is where the question sits; the full span goes in metadata.
        assert reported_page == 5
        assert meta.get("source_page_range") == [5, 8]
# ---------------------------------------------------------------------------
# Test: Size limit recursive split
# ---------------------------------------------------------------------------
class TestSizeLimit:
    """Oversized sections are split into several chunks under a token cap."""

    def test_size_limit(self):
        """A long QA answer is split and every piece repeats the question text."""
        # 80 paragraphs of filler give an answer far larger than the cap below.
        paragraphs = [f"Paragraph {i}: " + "x" * 200 for i in range(80)]
        oversized = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question="What is the detailed plan?",
            answer="\n\n".join(paragraphs),
            start_page=2,
            end_page=5,
            has_table=False,
        )
        # A small max_tokens forces the split.
        pieces = build_chunks_from_sections([oversized], max_tokens=500)
        assert len(pieces) > 1
        for piece_text, piece_page, _ in pieces:
            # Each piece carries the question text ...
            assert "What is the detailed plan?" in piece_text
            # ... and is attributed to the question's page.
            assert piece_page == 2
# ---------------------------------------------------------------------------
# Test: build_chunks_from_sections
# ---------------------------------------------------------------------------
class TestBuildChunksFromSections:
    """End-to-end check of build_chunks_from_sections over mixed section types."""

    def test_build_chunks_from_sections(self):
        """Chunk texts and metadata match the qa/narrative/speaking_notes/toc input."""
        qa_section = Section(
            type="qa",
            heading="(A) 排水系統",
            qa_id="A1",
            question="古洞北的設計是否能抵禦氣候變化?",
            answer="研究顧問已為古洞北進行了評估。",
            start_page=2,
            end_page=3,
            has_table=True,
            parent_topic="排水系統",
        )
        narrative_section = Section(
            type="narrative",
            heading="(1) 住戶的安置補償",
            content="合資格住戶可選擇安置安排。",
            start_page=3,
            end_page=5,
            has_table=False,
        )
        notes_section = Section(
            type="speaking_notes",
            heading="發言要點",
            content="⚫ 要點一:政策方向\n⚫ 要點二:實施計劃",
            start_page=1,
            end_page=1,
            has_table=False,
        )
        toc_section = Section(
            type="toc",
            heading="目錄",
            content="Page 1 ... Page 2",
            start_page=1,
            end_page=1,
            has_table=False,
        )
        chunks = build_chunks_from_sections(
            [qa_section, narrative_section, notes_section, toc_section]
        )

        def of_type(kind):
            """Collect the (text, page, metadata) chunks whose section_type is *kind*."""
            return [
                (text, page, meta)
                for text, page, meta in chunks
                if meta.get("section_type") == kind
            ]

        # Expected minimum: 1 QA + 1 narrative + 2 speaking-note bullets + 0 toc.
        assert len(chunks) >= 4

        # The QA chunk comes first.
        first_text, first_page, first_meta = chunks[0]
        assert "古洞北" in first_text
        assert first_page == 2
        expected_meta = {
            "section_type": "qa",
            "question_id": "A1",
            "question_index": 0,
        }
        for key, value in expected_meta.items():
            assert first_meta[key] == value
        assert first_meta["answer_contains_table"] is True
        assert first_meta["section_heading"] == "(A) 排水系統"

        # Exactly one narrative chunk, containing both heading and body text.
        narrative_chunks = of_type("narrative")
        assert len(narrative_chunks) == 1
        narrative_text = narrative_chunks[0][0]
        assert "住戶的安置補償" in narrative_text
        assert "合資格住戶" in narrative_text

        # One chunk per speaking-note bullet.
        bullet_chunks = of_type("speaking_notes")
        assert len(bullet_chunks) == 2
        assert all("要點" in text for text, _, _ in bullet_chunks)

        # Table-of-contents sections produce no chunks.
        assert len(of_type("toc")) == 0
# ---------------------------------------------------------------------------
# Test: preprocess_text
# ---------------------------------------------------------------------------
class TestPreprocessText:
    """Tests for preprocess_text: page-break markers, colon normalization, footer stripping."""

    def test_preprocess_text(self):
        """Footer markers stripped, colons normalized, page breaks inserted."""
        # The fullwidth colons (U+FF1A) are written as escapes so that tools
        # which strip or flag ambiguous Unicode cannot silently remove them.
        # Without them the normalization step is not exercised at all.
        pages = [
            (1, "Header\n(A) Section Title\nX-1\n2024-01-15"),
            (2, "Content with\uff1afullwidth colon\nMore text\uff1ahere"),
        ]
        result = preprocess_text(pages)
        # Should have page break markers
        assert "[PAGE_BREAK: 1]" in result
        assert "[PAGE_BREAK: 2]" in result
        # Fullwidth colons normalized to ASCII. This previously read
        # `"" not in result`, which fails for every string because the empty
        # string is a substring of everything.
        assert "\uff1a" not in result
        assert ":" in result
        # Page footer patterns should be stripped (X-1, dates like 2024-01-15)
        assert "X-1" not in result
        assert "2024-01-15" not in result
# ---------------------------------------------------------------------------
# Test: build_structure_detection_prompt
# ---------------------------------------------------------------------------
class TestBuildPrompt:
    """Tests for build_structure_detection_prompt."""

    def test_build_structure_detection_prompt(self):
        """Prompt contains key instructions for LLM classification."""
        text = "Sample document text [PAGE_BREAK: 1]"
        prompt = build_structure_detection_prompt(text)
        assert "Hong Kong Legislative Council" in prompt or "Legislative Council" in prompt
        # The second alternative used to be an empty string, which is contained
        # in every prompt and so made this assertion vacuous.
        # NOTE(review): the original marker character was lost; 問 is assumed
        # here as the Chinese question marker - confirm against the prompt template.
        assert "qa" in prompt.lower() or "問" in prompt
        assert "narrative" in prompt.lower()
        assert "speaking_notes" in prompt.lower() or "speaking notes" in prompt.lower()
        # The document text must be embedded verbatim in the prompt.
        assert text in prompt