"""Phase 8 tests: Q&A-pair chunking strategy (Sub-Phase 8.1). Covers: - LLM structure detection response parsing (parse_llm_structure_response) - Mixed format handling (問/答 + section headings) - Narrative-only text (no Q&A format) - Speaking notes (發言要點) chunking by bullet - Regex fast-pass for Chinese 問/答 format - Regex fast-pass for English Q1/Q2 format - Multi-page section tracking with [PAGE_BREAK] markers - ChunkingStrategy ABC compliance - Page number references question (問) page, not answer - Size limit: oversized sections recursively split with heading preserved - build_chunks_from_sections output verification - preprocess_text: footer stripping, colon normalization, page break insertion """ import json from typing import List, Tuple from unittest.mock import AsyncMock, MagicMock import pytest from app.utils.qa_chunking import ( Section, preprocess_text, build_structure_detection_prompt, parse_llm_structure_response, split_chinese_qa, split_english_qa, build_chunks_from_sections, ) from app.utils.chunking import ( ChunkingStrategy, QuestionChunkingStrategy, get_chunking_strategy, ) # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- @pytest.fixture def mock_settings(): """Minimal Settings mock with Q&A chunking defaults.""" s = MagicMock() s.default_chunking_strategy = "question" s.qa_vision_enabled = False s.qa_max_chunk_tokens = 3000 s.qa_structure_model = "" s.qa_include_internal_refs = True s.qa_cache_vision_results = True s.chunk_size = 1000 s.chunk_overlap = 200 s.llm_model_name = "test-model" s.llm_api_key = "test-key" s.llm_base_url = "https://example.com/v1" s.llm_timeout = 30.0 s.llm_enable_thinking = False s.vllm_engine = False return s SAMPLE_LLM_RESPONSE = json.dumps({ "sections": [ { "type": "qa", "heading": "(A) 排水系統", "qa_id": "A1", "question": "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?", "answer": "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。", "start_page": 2, "end_page": 3, "has_table": False, "parent_topic": "排水系統", }, { "type": "narrative", "heading": "(1) 住戶的安置補償", "content": "合資格住戶可選擇(i)「須通過經濟狀況審查」安置安排。", "start_page": 2, "end_page": 5, "has_table": False, }, { "type": "speaking_notes", "heading": "發言要點", "content": "⚫ 古洞北/粉嶺北新發展區是北部都會區內的新市鎮發展項目\n⚫ 第一期發展預計於2030年完成", "start_page": 1, "end_page": 2, "has_table": False, }, ] }) # --------------------------------------------------------------------------- # Test: LLM structure detection parsing # --------------------------------------------------------------------------- class TestLLMStructureDetection: def test_llm_structure_detection(self): """parse_llm_structure_response correctly parses LLM JSON with qa/narrative/speaking_notes.""" sections = parse_llm_structure_response(SAMPLE_LLM_RESPONSE) assert len(sections) == 3 qa = sections[0] assert qa.type == "qa" assert qa.qa_id == "A1" assert qa.question == "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?" assert qa.answer == "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。" assert qa.start_page == 2 assert qa.end_page == 3 assert qa.heading == "(A) 排水系統" assert qa.parent_topic == "排水系統" narr = sections[1] assert narr.type == "narrative" assert narr.heading == "(1) 住戶的安置補償" assert "合資格住戶" in narr.content notes = sections[2] assert notes.type == "speaking_notes" assert "⚫" in notes.content def test_llm_handles_mixed_formats(self): """Document with 問/答 markers + section headings correctly classified.""" mixed_json = json.dumps({ "sections": [ { "type": "qa", "heading": "(B) 交通", "qa_id": "B1", "question": "新建道路何時通車?", "answer": "預計2027年通車。", "start_page": 3, "end_page": 4, "has_table": False, }, { "type": "narrative", "heading": "背景", "content": "本文件說明交通規劃。", "start_page": 1, "end_page": 2, "has_table": False, }, ] }) sections = parse_llm_structure_response(mixed_json) assert len(sections) == 2 assert sections[0].type == "qa" assert sections[1].type == "narrative" def test_llm_handles_no_qa_format(self): """Narrative-only text (like File L pages 1-13) produces only narrative sections.""" narrative_json = json.dumps({ "sections": [ { "type": "narrative", "heading": "Introduction", "content": "This document provides background on policy matters.", "start_page": 1, "end_page": 5, "has_table": False, }, { "type": "narrative", "heading": "Analysis", "content": "The analysis covers multiple dimensions.", "start_page": 5, "end_page": 13, "has_table": False, }, ] }) sections = parse_llm_structure_response(narrative_json) assert len(sections) == 2 assert all(s.type == "narrative" for s in sections) def test_llm_handles_speaking_notes(self): """發言要點 text with bullet points produces speaking_notes sections.""" notes_json = json.dumps({ "sections": [ { "type": "speaking_notes", "heading": "發言要點", "content": "⚫ 要點一:政策方向\n⚫ 要點二:實施計劃\n⚫ 要點三:預算安排", "start_page": 1, "end_page": 2, "has_table": False, }, ] }) sections = parse_llm_structure_response(notes_json) assert len(sections) == 1 assert sections[0].type == "speaking_notes" assert sections[0].content.count("⚫") == 3 def test_parse_markdown_fenced_json(self): """parse_llm_structure_response handles ```json ... ``` wrapped responses.""" fenced = '```json\n' + SAMPLE_LLM_RESPONSE + '\n```' sections = parse_llm_structure_response(fenced) assert len(sections) == 3 def test_parse_invalid_json_raises(self): """parse_llm_structure_response raises ValueError on non-JSON input.""" with pytest.raises(ValueError, match="Invalid JSON"): parse_llm_structure_response("this is not json") # --------------------------------------------------------------------------- # Test: Regex fast-pass # --------------------------------------------------------------------------- class TestRegexFastPass: def test_regex_fastpass_chinese(self): """Text with 問B1/答B1 markers detected by split_chinese_qa without LLM.""" text = ( "(A) 排水系統\n" "問 B1:古洞北的設計是否能抵禦氣候變化?\n" "答 B1:研究顧問已為古洞北新發展區進行了評估。\n" "問 B2:第二個問題是什麼?\n" "答 B2:這是第二個問題的答案。\n" ) sections = split_chinese_qa(text) assert len(sections) >= 2 # All should be QA type assert all(s.type == "qa" for s in sections) # First should have question containing 古洞北 assert "古洞北" in sections[0].question def test_regex_fastpass_chinese_no_match(self): """split_chinese_qa returns empty list when no markers found.""" text = "This is plain text without any Q&A markers." assert split_chinese_qa(text) == [] def test_regex_fastpass_english(self): """Text with Q1, Q2 markers detected by split_english_qa without LLM.""" text = ( "Background information here.\n\n" "Q1 What is the timeline for the project?\n" "The project is expected to complete by 2027.\n" "Q2 How much will it cost?\n" "The estimated cost is HK$500 million.\n" ) sections = split_english_qa(text) assert len(sections) >= 2 assert all(s.type == "qa" for s in sections) assert any("timeline" in (s.question or "").lower() for s in sections) def test_regex_fastpass_english_no_match(self): """split_english_qa returns empty list when no markers found.""" text = "純中文文本沒有英文問答標記。" assert split_english_qa(text) == [] # --------------------------------------------------------------------------- # Test: Multi-page tracking # --------------------------------------------------------------------------- class TestMultiPage: def test_multi_page_sections(self): """Sections with [PAGE_BREAK: N] markers spanning pages track correctly.""" pages = [ (1, "Header line\n(A) Water drainage\nSome intro text"), (2, "More drainage info\nFooter text X-1"), (3, "New section begins\n(B) Traffic planning"), ] text = preprocess_text(pages) # Should have page break markers assert "[PAGE_BREAK: 1]" in text assert "[PAGE_BREAK: 2]" in text assert "[PAGE_BREAK: 3]" in text # --------------------------------------------------------------------------- # Test: ABC contract # --------------------------------------------------------------------------- class TestABCContract: def test_abc_contract(self): """QuestionChunkingStrategy satisfies ChunkingStrategy ABC.""" mock_settings = MagicMock() mock_settings.qa_max_chunk_tokens = 3000 mock_settings.qa_include_internal_refs = True strategy = QuestionChunkingStrategy(settings=mock_settings) assert isinstance(strategy, ChunkingStrategy) def test_get_chunking_strategy_factory(self, mock_settings): """get_chunking_strategy returns correct strategy type.""" token_strat = get_chunking_strategy("token", mock_settings) assert isinstance(token_strat, ChunkingStrategy) q_strat = get_chunking_strategy("question", mock_settings) assert isinstance(q_strat, QuestionChunkingStrategy) # --------------------------------------------------------------------------- # Test: Page number reference # --------------------------------------------------------------------------- class TestPageNumberReference: def test_page_number_reference_question(self): """Page ref in metadata points to question (問) page, not answer page.""" sections = [ Section( type="qa", heading="(A) Topic", qa_id="A1", question="What is X?", answer="X is Y.", start_page=5, end_page=8, ), ] chunks = build_chunks_from_sections(sections) assert len(chunks) == 1 chunk_text, page_num, metadata = chunks[0] # Page number should be start_page (question location) assert page_num == 5 assert metadata.get("source_page_range") == [5, 8] # --------------------------------------------------------------------------- # Test: Size limit recursive split # --------------------------------------------------------------------------- class TestSizeLimit: def test_size_limit(self): """Oversized QA section > 3000 tokens gets recursively split with question prepended.""" # Create a QA pair with a very long answer long_answer = "\n\n".join(f"Paragraph {i}: " + "x" * 200 for i in range(80)) sections = [ Section( type="qa", heading="(A) Topic", qa_id="A1", question="What is the detailed plan?", answer=long_answer, start_page=2, end_page=5, has_table=False, ), ] # Use a small max_tokens to force splitting chunks = build_chunks_from_sections(sections, max_tokens=500) assert len(chunks) > 1 # Each chunk should have the question text prepended for chunk_text, page_num, metadata in chunks: assert "What is the detailed plan?" in chunk_text # Page number should always be the question page assert page_num == 2 # --------------------------------------------------------------------------- # Test: build_chunks_from_sections # --------------------------------------------------------------------------- class TestBuildChunksFromSections: def test_build_chunks_from_sections(self): """Verify chunk texts and metadata from sections list.""" sections = [ Section( type="qa", heading="(A) 排水系統", qa_id="A1", question="古洞北的設計是否能抵禦氣候變化?", answer="研究顧問已為古洞北進行了評估。", start_page=2, end_page=3, has_table=True, parent_topic="排水系統", ), Section( type="narrative", heading="(1) 住戶的安置補償", content="合資格住戶可選擇安置安排。", start_page=3, end_page=5, has_table=False, ), Section( type="speaking_notes", heading="發言要點", content="⚫ 要點一:政策方向\n⚫ 要點二:實施計劃", start_page=1, end_page=1, has_table=False, ), Section( type="toc", heading="目錄", content="Page 1 ... Page 2", start_page=1, end_page=1, has_table=False, ), ] chunks = build_chunks_from_sections(sections) # Should have: 1 QA + 1 narrative + 2 speaking_notes bullets + 0 toc = 4 assert len(chunks) >= 4 # First chunk: QA qa_text, qa_page, qa_meta = chunks[0] assert "古洞北" in qa_text assert qa_page == 2 assert qa_meta["section_type"] == "qa" assert qa_meta["question_id"] == "A1" assert qa_meta["question_index"] == 0 assert qa_meta["answer_contains_table"] is True assert qa_meta["section_heading"] == "(A) 排水系統" # Find the narrative chunk narr_chunks = [(t, p, m) for t, p, m in chunks if m.get("section_type") == "narrative"] assert len(narr_chunks) == 1 narr_text, narr_page, narr_meta = narr_chunks[0] assert "住戶的安置補償" in narr_text assert "合資格住戶" in narr_text # Find speaking_notes chunks notes_chunks = [(t, p, m) for t, p, m in chunks if m.get("section_type") == "speaking_notes"] assert len(notes_chunks) == 2 for t, p, m in notes_chunks: assert "要點" in t # No TOC chunks toc_chunks = [(t, p, m) for t, p, m in chunks if m.get("section_type") == "toc"] assert len(toc_chunks) == 0 # --------------------------------------------------------------------------- # Test: preprocess_text # --------------------------------------------------------------------------- class TestPreprocessText: def test_preprocess_text(self): """Footer markers stripped, colons normalized, page breaks inserted.""" pages = [ (1, "Header\n(A) Section Title\nX-1\n2024-01-15"), (2, "Content with:fullwidth colon\nMore text:here"), ] result = preprocess_text(pages) # Should have page break markers assert "[PAGE_BREAK: 1]" in result assert "[PAGE_BREAK: 2]" in result # Fullwidth colons normalized to ASCII assert ":" not in result assert ":" in result # Page footer patterns should be stripped (X-1, dates like 2024-01-15) assert "X-1" not in result assert "2024-01-15" not in result # --------------------------------------------------------------------------- # Test: build_structure_detection_prompt # --------------------------------------------------------------------------- class TestBuildPrompt: def test_build_structure_detection_prompt(self): """Prompt contains key instructions for LLM classification.""" text = "Sample document text [PAGE_BREAK: 1]" prompt = build_structure_detection_prompt(text) assert "Hong Kong Legislative Council" in prompt or "Legislative Council" in prompt assert "qa" in prompt.lower() or "問" in prompt assert "narrative" in prompt.lower() assert "speaking_notes" in prompt.lower() or "speaking notes" in prompt.lower() assert text in prompt