# legco_ai_assistant/backend/app/test/test_phase8_qa_chunking.py

"""Phase 8 tests: Q&A-pair chunking strategy (Sub-Phase 8.1).

Covers:
- LLM structure detection response parsing (parse_llm_structure_response)
- Mixed format handling (問/答 + section headings)
- Narrative-only text (no Q&A format)
- Speaking notes (發言要點) chunking by bullet
- Regex fast-pass for Chinese 問/答 format
- Regex fast-pass for English Q1/Q2 format
- Multi-page section tracking with [PAGE_BREAK] markers
- ChunkingStrategy ABC compliance
- Page number references question (問) page, not answer
- Size limit: oversized sections recursively split with heading preserved
- build_chunks_from_sections output verification
- preprocess_text: footer stripping, colon normalization, page break insertion
"""

import json
from typing import List, Tuple
from unittest.mock import AsyncMock, MagicMock

import pytest

from app.utils.qa_chunking import (
    Section,
    preprocess_text,
    build_structure_detection_prompt,
    parse_llm_structure_response,
    split_chinese_qa,
    split_english_qa,
    build_chunks_from_sections,
)
from app.utils.chunking import (
    ChunkingStrategy,
    QuestionChunkingStrategy,
    get_chunking_strategy,
)


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------

@pytest.fixture
def mock_settings():
    """Provide a ``MagicMock`` standing in for the application settings object.

    Each attribute listed in ``overrides`` is pinned to a concrete value on
    the mock; any attribute not listed here falls through to MagicMock's
    auto-created child mocks.
    """
    overrides = {
        # Chunking configuration
        "default_chunking_strategy": "question",
        "qa_vision_enabled": False,
        "qa_max_chunk_tokens": 3000,
        "qa_structure_model": "",
        "qa_include_internal_refs": True,
        "qa_cache_vision_results": True,
        "chunk_size": 1000,
        "chunk_overlap": 200,
        # LLM client configuration
        "llm_model_name": "test-model",
        "llm_api_key": "test-key",
        "llm_base_url": "https://example.com/v1",
        "llm_timeout": 30.0,
        "llm_enable_thinking": False,
        "vllm_engine": False,
    }
    settings = MagicMock()
    for attr_name, attr_value in overrides.items():
        setattr(settings, attr_name, attr_value)
    return settings


# Canned structure-detection payload shared by several tests: exactly one
# section of each type ("qa", "narrative", "speaking_notes"), in that order.
_SAMPLE_QA_SECTION = {
    "type": "qa",
    "heading": "(A) 排水系統",
    "qa_id": "A1",
    "question": "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化？",
    "answer": "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。",
    "start_page": 2,
    "end_page": 3,
    "has_table": False,
    "parent_topic": "排水系統",
}

_SAMPLE_NARRATIVE_SECTION = {
    "type": "narrative",
    "heading": "(1) 住戶的安置補償",
    "content": "合資格住戶可選擇(i)「須通過經濟狀況審查」安置安排。",
    "start_page": 2,
    "end_page": 5,
    "has_table": False,
}

_SAMPLE_SPEAKING_NOTES_SECTION = {
    "type": "speaking_notes",
    "heading": "發言要點",
    "content": "⚫ 古洞北／粉嶺北新發展區是北部都會區內的新市鎮發展項目\n⚫ 第一期發展預計於2030年完成",
    "start_page": 1,
    "end_page": 2,
    "has_table": False,
}

# Serialized once at import time; tests feed this string straight to the parser.
SAMPLE_LLM_RESPONSE = json.dumps(
    {
        "sections": [
            _SAMPLE_QA_SECTION,
            _SAMPLE_NARRATIVE_SECTION,
            _SAMPLE_SPEAKING_NOTES_SECTION,
        ]
    }
)


# ---------------------------------------------------------------------------
# Test: LLM structure detection parsing
# ---------------------------------------------------------------------------

class TestLLMStructureDetection:
    """Exercise ``parse_llm_structure_response`` against canned LLM JSON payloads."""

    @staticmethod
    def _parse_sections(section_dicts):
        """Wrap *section_dicts* under a ``sections`` key, serialize, and parse."""
        return parse_llm_structure_response(json.dumps({"sections": section_dicts}))

    def test_llm_structure_detection(self):
        """The shared sample payload yields one qa, one narrative and one speaking_notes section."""
        parsed = parse_llm_structure_response(SAMPLE_LLM_RESPONSE)
        assert len(parsed) == 3
        qa_section, narrative_section, notes_section = parsed[0], parsed[1], parsed[2]

        # Every QA field carried by the payload must survive parsing unchanged.
        expected_qa_fields = {
            "type": "qa",
            "qa_id": "A1",
            "question": "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化？",
            "answer": "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。",
            "start_page": 2,
            "end_page": 3,
            "heading": "(A) 排水系統",
            "parent_topic": "排水系統",
        }
        for field_name, expected_value in expected_qa_fields.items():
            assert getattr(qa_section, field_name) == expected_value

        assert narrative_section.type == "narrative"
        assert narrative_section.heading == "(1) 住戶的安置補償"
        assert "合資格住戶" in narrative_section.content

        assert notes_section.type == "speaking_notes"
        assert "⚫" in notes_section.content

    def test_llm_handles_mixed_formats(self):
        """A payload mixing a qa entry with a narrative entry keeps both types, in order."""
        payload = [
            {
                "type": "qa",
                "heading": "(B) 交通",
                "qa_id": "B1",
                "question": "新建道路何時通車？",
                "answer": "預計2027年通車。",
                "start_page": 3,
                "end_page": 4,
                "has_table": False,
            },
            {
                "type": "narrative",
                "heading": "背景",
                "content": "本文件說明交通規劃。",
                "start_page": 1,
                "end_page": 2,
                "has_table": False,
            },
        ]
        parsed = self._parse_sections(payload)
        assert len(parsed) == 2
        first, second = parsed[0], parsed[1]
        assert first.type == "qa"
        assert second.type == "narrative"

    def test_llm_handles_no_qa_format(self):
        """A payload containing only narrative entries parses to narrative sections only."""
        payload = [
            {
                "type": "narrative",
                "heading": "Introduction",
                "content": "This document provides background on policy matters.",
                "start_page": 1,
                "end_page": 5,
                "has_table": False,
            },
            {
                "type": "narrative",
                "heading": "Analysis",
                "content": "The analysis covers multiple dimensions.",
                "start_page": 5,
                "end_page": 13,
                "has_table": False,
            },
        ]
        parsed = self._parse_sections(payload)
        assert len(parsed) == 2
        for section in parsed:
            assert section.type == "narrative"

    def test_llm_handles_speaking_notes(self):
        """A bulleted speaking-notes entry keeps its type and all three bullets."""
        payload = [
            {
                "type": "speaking_notes",
                "heading": "發言要點",
                "content": "⚫ 要點一：政策方向\n⚫ 要點二：實施計劃\n⚫ 要點三：預算安排",
                "start_page": 1,
                "end_page": 2,
                "has_table": False,
            },
        ]
        parsed = self._parse_sections(payload)
        assert len(parsed) == 1
        only_section = parsed[0]
        assert only_section.type == "speaking_notes"
        assert only_section.content.count("⚫") == 3

    def test_parse_markdown_fenced_json(self):
        """A payload wrapped in a Markdown ``json`` code fence is still parsed."""
        fenced_payload = f"```json\n{SAMPLE_LLM_RESPONSE}\n```"
        parsed = parse_llm_structure_response(fenced_payload)
        assert len(parsed) == 3

    def test_parse_invalid_json_raises(self):
        """Non-JSON input raises ``ValueError`` whose message matches "Invalid JSON"."""
        garbage = "this is not json"
        with pytest.raises(ValueError, match="Invalid JSON"):
            parse_llm_structure_response(garbage)


# ---------------------------------------------------------------------------
# Test: Regex fast-pass
# ---------------------------------------------------------------------------

class TestRegexFastPass:
    """Exercise the regex splitters ``split_chinese_qa`` and ``split_english_qa``."""

    def test_regex_fastpass_chinese(self):
        """問/答-marked text is split into QA sections by ``split_chinese_qa``."""
        lines = [
            "(A) 排水系統",
            "問 B1：古洞北的設計是否能抵禦氣候變化？",
            "答 B1：研究顧問已為古洞北新發展區進行了評估。",
            "問 B2：第二個問題是什麼？",
            "答 B2：這是第二個問題的答案。",
        ]
        # Every line, including the last, is newline-terminated.
        document = "".join(f"{line}\n" for line in lines)

        found = split_chinese_qa(document)
        assert len(found) >= 2
        # Every returned section must be typed as a QA pair.
        for section in found:
            assert section.type == "qa"
        # The first question is the one mentioning 古洞北.
        first_question = found[0].question
        assert "古洞北" in first_question

    def test_regex_fastpass_chinese_no_match(self):
        """Text without 問/答 markers gives an empty list."""
        plain = "This is plain text without any Q&A markers."
        result = split_chinese_qa(plain)
        assert result == []

    def test_regex_fastpass_english(self):
        """Q1/Q2-marked text is split into QA sections by ``split_english_qa``."""
        lines = [
            "Background information here.",
            "",
            "Q1 What is the timeline for the project?",
            "The project is expected to complete by 2027.",
            "Q2 How much will it cost?",
            "The estimated cost is HK$500 million.",
        ]
        document = "\n".join(lines) + "\n"

        found = split_english_qa(document)
        assert len(found) >= 2
        for section in found:
            assert section.type == "qa"
        # A missing question is treated as an empty string before matching.
        lowered_questions = [(section.question or "").lower() for section in found]
        assert any("timeline" in question for question in lowered_questions)

    def test_regex_fastpass_english_no_match(self):
        """Text without Q-number markers gives an empty list."""
        chinese_only = "純中文文本沒有英文問答標記。"
        result = split_english_qa(chinese_only)
        assert result == []


# ---------------------------------------------------------------------------
# Test: Multi-page tracking
# ---------------------------------------------------------------------------

class TestMultiPage:
    """Check page-break markers emitted by ``preprocess_text`` for multi-page input."""

    def test_multi_page_sections(self):
        """Each input page number shows up as a ``[PAGE_BREAK: N]`` marker in the output."""
        pages = [
            (1, "Header line\n(A) Water drainage\nSome intro text"),
            (2, "More drainage info\nFooter text X-1"),
            (3, "New section begins\n(B) Traffic planning"),
        ]
        combined = preprocess_text(pages)

        # One marker per page; only presence is checked here, not position.
        for page_number, _page_text in pages:
            assert f"[PAGE_BREAK: {page_number}]" in combined


# ---------------------------------------------------------------------------
# Test: ABC contract
# ---------------------------------------------------------------------------

class TestABCContract:
    """Check that the question strategy honours the ``ChunkingStrategy`` interface."""

    def test_abc_contract(self, mock_settings):
        """QuestionChunkingStrategy is an instance of the ChunkingStrategy ABC.

        Uses the shared ``mock_settings`` fixture rather than a locally built
        mock of the same name, so the settings under test are defined in one
        place and the fixture name is not shadowed inside the test.
        """
        strategy = QuestionChunkingStrategy(settings=mock_settings)
        assert isinstance(strategy, ChunkingStrategy)

    def test_get_chunking_strategy_factory(self, mock_settings):
        """get_chunking_strategy returns the expected type for each strategy name."""
        token_strat = get_chunking_strategy("token", mock_settings)
        assert isinstance(token_strat, ChunkingStrategy)

        q_strat = get_chunking_strategy("question", mock_settings)
        assert isinstance(q_strat, QuestionChunkingStrategy)


# ---------------------------------------------------------------------------
# Test: Page number reference
# ---------------------------------------------------------------------------

class TestPageNumberReference:
    """Check which page a QA chunk is attributed to."""

    def test_page_number_reference_question(self):
        """A QA chunk reports its start page (where the question sits) plus the full range."""
        qa_pair = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question="What is X?",
            answer="X is Y.",
            start_page=5,
            end_page=8,
        )
        produced = build_chunks_from_sections([qa_pair])
        assert len(produced) == 1

        _text, reported_page, meta = produced[0]
        # The chunk's page is the section's start_page, not its end_page.
        assert reported_page == 5
        assert meta.get("source_page_range") == [5, 8]


# ---------------------------------------------------------------------------
# Test: Size limit recursive split
# ---------------------------------------------------------------------------

class TestSizeLimit:
    """Check splitting of a QA section that exceeds the token budget."""

    def test_size_limit(self):
        """A QA section over ``max_tokens`` is split; each piece keeps the question and its page."""
        question_text = "What is the detailed plan?"
        # 80 paragraphs of 200 filler characters each, separated by blank lines.
        filler = "x" * 200
        paragraphs = [f"Paragraph {idx}: {filler}" for idx in range(80)]
        oversized_answer = "\n\n".join(paragraphs)

        qa_pair = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question=question_text,
            answer=oversized_answer,
            start_page=2,
            end_page=5,
            has_table=False,
        )
        # A deliberately small budget forces the split.
        pieces = build_chunks_from_sections([qa_pair], max_tokens=500)
        assert len(pieces) > 1

        for piece_text, piece_page, _meta in pieces:
            # The question text is carried into every piece.
            assert question_text in piece_text
            # Each piece is attributed to the question's page.
            assert piece_page == 2


# ---------------------------------------------------------------------------
# Test: build_chunks_from_sections
# ---------------------------------------------------------------------------

class TestBuildChunksFromSections:
    """Check ``build_chunks_from_sections`` output for a mix of section types."""

    @staticmethod
    def _chunks_of_type(chunks, section_type):
        """Return the (text, page, metadata) chunks whose metadata has *section_type*."""
        return [
            (text, page, meta)
            for text, page, meta in chunks
            if meta.get("section_type") == section_type
        ]

    def test_build_chunks_from_sections(self):
        """QA, narrative and speaking-notes sections become chunks; the TOC section does not."""
        qa_section = Section(
            type="qa",
            heading="(A) 排水系統",
            qa_id="A1",
            question="古洞北的設計是否能抵禦氣候變化？",
            answer="研究顧問已為古洞北進行了評估。",
            start_page=2,
            end_page=3,
            has_table=True,
            parent_topic="排水系統",
        )
        narrative_section = Section(
            type="narrative",
            heading="(1) 住戶的安置補償",
            content="合資格住戶可選擇安置安排。",
            start_page=3,
            end_page=5,
            has_table=False,
        )
        notes_section = Section(
            type="speaking_notes",
            heading="發言要點",
            content="⚫ 要點一：政策方向\n⚫ 要點二：實施計劃",
            start_page=1,
            end_page=1,
            has_table=False,
        )
        toc_section = Section(
            type="toc",
            heading="目錄",
            content="Page 1 ... Page 2",
            start_page=1,
            end_page=1,
            has_table=False,
        )

        chunks = build_chunks_from_sections(
            [qa_section, narrative_section, notes_section, toc_section]
        )
        # Expected: 1 QA + 1 narrative + 2 speaking-notes bullets + 0 TOC.
        assert len(chunks) >= 4

        # The QA section comes first in the output.
        first_text, first_page, first_meta = chunks[0]
        assert "古洞北" in first_text
        assert first_page == 2
        expected_meta = {
            "section_type": "qa",
            "question_id": "A1",
            "question_index": 0,
            "section_heading": "(A) 排水系統",
        }
        for key, expected_value in expected_meta.items():
            assert first_meta[key] == expected_value
        assert first_meta["answer_contains_table"] is True

        # Exactly one narrative chunk, containing both heading and body text.
        narrative_chunks = self._chunks_of_type(chunks, "narrative")
        assert len(narrative_chunks) == 1
        narrative_text = narrative_chunks[0][0]
        assert "住戶的安置補償" in narrative_text
        assert "合資格住戶" in narrative_text

        # One chunk per speaking-notes bullet.
        bullet_chunks = self._chunks_of_type(chunks, "speaking_notes")
        assert len(bullet_chunks) == 2
        for bullet_text, _page, _meta in bullet_chunks:
            assert "要點" in bullet_text

        # The table of contents produces no chunks.
        assert len(self._chunks_of_type(chunks, "toc")) == 0


# ---------------------------------------------------------------------------
# Test: preprocess_text
# ---------------------------------------------------------------------------

class TestPreprocessText:
    """Check the text clean-up performed by ``preprocess_text``."""

    def test_preprocess_text(self):
        """Footer markers stripped, colons normalized, page breaks inserted."""
        pages = [
            (1, "Header\n(A) Section Title\nX-1\n2024-01-15"),
            (2, "Content with：fullwidth colon\nMore text：here"),
        ]
        result = preprocess_text(pages)

        # Should have page break markers
        page_markers = ("[PAGE_BREAK: 1]", "[PAGE_BREAK: 2]")
        for marker in page_markers:
            assert marker in result

        # Fullwidth colons normalized to ASCII
        assert "：" not in result
        # The page-break markers themselves contain ":", so testing the raw
        # result would pass even if the fullwidth colons had simply been
        # dropped. Remove the markers first so the check reflects page content.
        body = result
        for marker in page_markers:
            body = body.replace(marker, "")
        assert ":" in body

        # Page footer patterns should be stripped (X-1, dates like 2024-01-15)
        assert "X-1" not in result
        assert "2024-01-15" not in result


# ---------------------------------------------------------------------------
# Test: build_structure_detection_prompt
# ---------------------------------------------------------------------------

class TestBuildPrompt:
    """Check the contents of the prompt from ``build_structure_detection_prompt``."""

    def test_build_structure_detection_prompt(self):
        """The prompt names the domain, the section types, and embeds the document text."""
        document = "Sample document text [PAGE_BREAK: 1]"
        prompt = build_structure_detection_prompt(document)
        lowered = prompt.lower()

        # Domain context: either phrasing of the council name is accepted.
        council_phrases = ("Hong Kong Legislative Council", "Legislative Council")
        assert any(phrase in prompt for phrase in council_phrases)

        # Section-type vocabulary, matched case-insensitively.
        assert "qa" in lowered or "問" in prompt
        assert "narrative" in lowered
        assert any(label in lowered for label in ("speaking_notes", "speaking notes"))

        # The document text itself must appear verbatim.
        assert document in prompt