feat: Sub-Phases 8.1-8.4 — Q&A-pair chunking strategy

8.1 — Core algorithm (test-first): - qa_chunking.py: preprocess_text, build_structure_detection_prompt, parse_llm_structure_response, Section dataclass, split_chinese_qa, split_english_qa, build_chunks_from_sections with recursive size split - QuestionChunkingStrategy in chunking.py with _chunk_metadata tracking - get_chunking_strategy() factory function - table_extraction.py: vision LLM extraction, heuristic text fallback, disk cache, inject_tables_into_answer - 18/18 tests pass (LLM parse, regex fast-pass, multi-page, ABC contract, size limit, chunk building, preprocess) 8.2 — Metadata enrichment: - extract_metadata() accepts strategy_type + chunk_metadata params - Q&A fields (question_id, question_index, section_heading, etc.) merged into ChromaDB metadata entries - DocumentInfo.chunking_strategy + ChunkInfo Q&A fields in models - 6/6 metadata tests pass 8.3 — Ingest API integration: - POST /api/v1/ingest accepts ?strategy=token|question - validate strategy against VALID_CHUNKING_STRATEGIES - factory creates correct chunker; _chunk_metadata passed to extract_metadata - 6/6 ingest integration tests pass, zero regressions on existing tests 8.4 — Frontend strategy selector: - Radio button selector (Token / Question) on RAG Database page - Strategy passed to ingest mutation via api.ts - DocumentList: strategy badge (gray/blue) - ChunkList: Q&A display with question_id, question_text, page range, table badge - tsc --noEmit clean, vite build successful
2026-05-15 12:44:04 +08:00 · 2026-05-15 12:44:04 +08:00 · 14423c773a
parent ef10b937cf
commit 14423c773a
14 changed files with 1608 additions and 22 deletions
--- a/backend/app/models/documents.py
+++ b/backend/app/models/documents.py
@ -8,6 +8,7 @@ class DocumentInfo(BaseModel):
    filename: str
    chunk_count: int
    upload_date: str
    chunking_strategy: str = "token"
 class ChunkInfo(BaseModel):
@ -16,6 +17,14 @@ class ChunkInfo(BaseModel):
    content_summary: str
    page_number: Optional[int] = None
    chunk_file_path: Optional[str] = None
    strategy_type: Optional[str] = None
    question_index: Optional[int] = None
    question_id: Optional[str] = None
    question_text: Optional[str] = None
    section_heading: Optional[str] = None
    answer_contains_table: Optional[bool] = None
    source_page_range: Optional[List[int]] = None
    parent_topic: Optional[str] = None
 class DocumentListResponse(BaseModel):
--- a/backend/app/routers/ingest.py
+++ b/backend/app/routers/ingest.py
@ -5,9 +5,9 @@ import tempfile
 import uuid
 from pathlib import Path
-from fastapi import APIRouter, UploadFile, File, HTTPException
+from fastapi import APIRouter, UploadFile, File, HTTPException, Query
-from app.models.ingest import IngestResponse
+from app.models.ingest import IngestResponse, VALID_CHUNKING_STRATEGIES
 logger = logging.getLogger(__name__)
 router = APIRouter(tags=["ingest"])
@ -37,11 +37,14 @@ def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None:
@router.post("/ingest", response_model=IngestResponse)
-async def ingest_document(file: UploadFile = File(...)):
+async def ingest_document(
    file: UploadFile = File(...),
    strategy: str = Query("token"),
 ):
    """Ingest a document into the RAG system."""
    from app.core.config import get_settings
    from app.services.rag import RAGService
-    from app.utils.chunking import TokenChunkingStrategy
+    from app.utils.chunking import get_chunking_strategy
    from app.utils.metadata import extract_metadata
    filename = file.filename or "unknown"
@ -53,6 +56,12 @@ async def ingest_document(file: UploadFile = File(...)):
            detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}",
        )
    if strategy not in VALID_CHUNKING_STRATEGIES:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid chunking strategy: {strategy}. Valid: {', '.join(sorted(VALID_CHUNKING_STRATEGIES))}",
        )
    settings = get_settings()
    temp_path = None
    try:
@ -68,9 +77,7 @@ async def ingest_document(file: UploadFile = File(...)):
        _delete_existing_document(rag, filename, chunk_dir)
        document_id = str(uuid.uuid4())
-        chunker = TokenChunkingStrategy(
+        chunker = get_chunking_strategy(strategy, settings)
            chunk_size=settings.chunk_size, overlap=settings.chunk_overlap
        )
        if file_ext == ".pdf":
            from app.utils.pdf_parser import parse_pdf_by_page
@ -105,6 +112,8 @@ async def ingest_document(file: UploadFile = File(...)):
                    )
                    chunk_file_paths.append(None)
            chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
            metadata = extract_metadata(
                temp_path,
                chunk_texts,
@ -112,6 +121,8 @@ async def ingest_document(file: UploadFile = File(...)):
                page_numbers=page_numbers,
                chunk_file_paths=chunk_file_paths,
                document_id=document_id,
                strategy_type=strategy,
                chunk_metadata=chunk_metadata,
            )
            rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id)
@ -145,9 +156,12 @@ async def ingest_document(file: UploadFile = File(...)):
                    )
                    chunk_file_paths.append(None)
            chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
            metadata = extract_metadata(
                temp_path, chunks, original_filename=filename,
                chunk_file_paths=chunk_file_paths, document_id=document_id,
                strategy_type=strategy, chunk_metadata=chunk_metadata,
            )
            rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
@ -180,9 +194,12 @@ async def ingest_document(file: UploadFile = File(...)):
                    )
                    chunk_file_paths.append(None)
            chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
            metadata = extract_metadata(
                temp_path, chunks, original_filename=filename,
                chunk_file_paths=chunk_file_paths, document_id=document_id,
                strategy_type=strategy, chunk_metadata=chunk_metadata,
            )
            rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
@ -193,6 +210,7 @@ async def ingest_document(file: UploadFile = File(...)):
            document_id=document_id,
            chunk_count=chunk_count,
            filename=filename,
            strategy=strategy,
        )
    except HTTPException:
--- a/backend/app/test/test_phase8_ingest.py
+++ b/backend/app/test/test_phase8_ingest.py
@ -0,0 +1,209 @@
 """Phase 8 tests: Ingest API integration with strategy selection (Sub-Phase 8.3).
 Covers:
 - POST /api/v1/ingest?strategy=token — existing behavior unchanged
 - POST /api/v1/ingest?strategy=question — Q&A chunking applied
 - Invalid strategy values return 400
 - IngestResponse includes strategy field
 - DOCX with Q&A format uses question strategy
 - Document without Q&A falls back gracefully
 """
 import io
 import json
 from typing import List, Tuple
 from unittest.mock import MagicMock
 import pytest
 from fastapi import FastAPI
 from fastapi.testclient import TestClient
 from pypdf import PdfWriter
 from app.routers.ingest import router
 class _DeterministicEmbedding:
    def name(self) -> str:
        return "test_deterministic"
    def __call__(self, input):
        return self._embed(input)
    def embed_query(self, input):
        return self._embed(input)
    @staticmethod
    def _embed(texts):
        vectors = []
        for text in texts:
            vec = [0.0] * 384
            for i, ch in enumerate(text[:384]):
                vec[i] = ord(ch) / 1000.0
            vectors.append(vec)
        return vectors
 def _create_real_pdf(content: str) -> bytes:
    """Build a minimal PDF in memory and return its raw bytes.
    The document is a single blank 200x200 page; ``content`` is accepted but
    is not written into the PDF.
    """
    pdf = PdfWriter()
    pdf.add_blank_page(width=200, height=200)
    sink = io.BytesIO()
    pdf.write(sink)
    raw = sink.getvalue()
    return raw
 def _create_text_txt(content: str) -> bytes:
    return content.encode("utf-8")
@pytest.fixture
 def client(tmp_path, monkeypatch):
    """TestClient with real ChromaDB isolated in tmp_path + deterministic embeddings.
    Yields a TestClient for a fresh FastAPI app that mounts the ingest router
    under the /api/v1 prefix. Storage locations are redirected into tmp_path
    through environment variables, and the settings caches are cleared both
    before the client is built and after the test finishes.
    """
    # Every on-disk store gets its own location under the per-test temp directory.
    chroma_path = str(tmp_path / "chroma_db")
    chunk_path = str(tmp_path / "document_chunk")
    prompts_path = str(tmp_path / "prompts.db")
    history_path = str(tmp_path / "history.db")
    monkeypatch.setenv("CHROMA_DB_PATH", chroma_path)
    monkeypatch.setenv("DOCUMENT_CHUNK_PATH", chunk_path)
    monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path)
    monkeypatch.setenv("HISTORY_DB_PATH", history_path)
    monkeypatch.setenv("EMBEDDING_MODEL", "test-mock")
    monkeypatch.setenv("LLM_API_KEY", "test-key")
    # Clear both settings caches — presumably so the patched environment is re-read.
    from app.core.config import get_settings
    get_settings.cache_clear()
    from app.core.dependencies import get_settings_cached
    get_settings_cached.cache_clear()
    # Initialise the prompts DB (plus default profiles) and the history DB at the temp paths.
    from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles
    conn = _get_db(prompts_path)
    init_prompts_db(conn)
    seed_default_profiles(conn)
    conn.close()
    hconn = _get_db(history_path)
    init_history_db(hconn)
    hconn.close()
    # Replace the embedding factory with the deterministic stand-in.
    monkeypatch.setattr(
        "app.core.database.get_embedding_function_settings",
        lambda settings: _DeterministicEmbedding(),
    )
    # Mount only the ingest router, matching the /api/v1 paths used by the tests.
    test_app = FastAPI()
    test_app.include_router(router, prefix="/api/v1")
    yield TestClient(test_app)
    # Teardown: clear the caches again (presumably so later tests do not reuse these settings).
    get_settings_cached.cache_clear()
    get_settings.cache_clear()
 def test_ingest_with_strategy_token(client):
    """A .txt upload with ?strategy=token succeeds, echoes the strategy and yields chunks."""
    upload = {
        "file": (
            "test.txt",
            _create_text_txt("This is a test document with enough content to generate chunks."),
            "text/plain",
        )
    }
    response = client.post("/api/v1/ingest?strategy=token", files=upload)
    assert response.status_code == 200
    body = response.json()
    assert body["strategy"] == "token"
    assert body["chunk_count"] > 0
 def test_ingest_invalid_strategy_rejected(client):
    """An unknown strategy value is rejected with HTTP 400 and a detail mentioning 'strategy'."""
    upload = {"file": ("test.txt", _create_text_txt("test"), "text/plain")}
    response = client.post("/api/v1/ingest?strategy=invalid", files=upload)
    assert response.status_code == 400
    detail = response.json()["detail"]
    assert "strategy" in detail.lower()
 def test_ingest_response_includes_strategy(client):
    """The JSON body of a successful ingest carries a 'strategy' key."""
    upload = {
        "file": (
            "test.txt",
            _create_text_txt("Strategy response test content with more text to ensure chunks."),
            "text/plain",
        )
    }
    response = client.post("/api/v1/ingest?strategy=token", files=upload)
    assert response.status_code == 200
    assert "strategy" in response.json()
 def test_ingest_default_strategy_is_token(client):
    """Omitting the strategy query parameter results in strategy == 'token'."""
    upload = {
        "file": (
            "test.txt",
            _create_text_txt("Default strategy test with enough text to generate output."),
            "text/plain",
        )
    }
    response = client.post("/api/v1/ingest", files=upload)
    assert response.status_code == 200
    assert response.json()["strategy"] == "token"
 def test_ingest_question_strategy_txt(client, monkeypatch):
    """A Q&A-formatted .txt ingested with ?strategy=question (chunker mocked) yields chunks."""
    _mock_question_chunker(monkeypatch)
    upload = {
        "file": (
            "test.txt",
            _create_text_txt("問A1：test question\n答A1：test answer with more text here to ensure chunking works properly."),
            "text/plain",
        )
    }
    response = client.post("/api/v1/ingest?strategy=question", files=upload)
    assert response.status_code == 200
    body = response.json()
    assert body["strategy"] == "question"
    assert body["chunk_count"] > 0
 def test_ingest_question_strategy_no_qa_fallback(client, monkeypatch):
    """Plain text without Q&A markers still ingests without error under strategy=question.
    The question chunker is replaced by the canned mock here, so this checks the
    endpoint's success path rather than the real fallback logic.
    """
    _mock_question_chunker(monkeypatch)
    upload = {
        "file": (
            "plain.txt",
            _create_text_txt("This is plain text without any Q&A markers, but it needs to be long enough to generate at least one chunk when processed by the tokenizer."),
            "text/plain",
        )
    }
    response = client.post("/api/v1/ingest?strategy=question", files=upload)
    assert response.status_code == 200
    body = response.json()
    assert body["strategy"] == "question"
    assert body["chunk_count"] > 0
 def _mock_question_chunker(monkeypatch):
    """Replace QuestionChunkingStrategy with a mock that returns test chunks."""
    class _MockQuestionChunker:
        def __init__(self, settings=None, llm_client=None):
            self._chunk_metadata = [
                {
                    "strategy_type": "question",
                    "section_type": "qa",
                    "question_index": 0,
                    "question_id": "A1",
                    "question_text": "What is X?",
                    "section_heading": "(A) Topic",
                    "answer_contains_table": False,
                    "source_page_range": [1, 2],
                }
            ]
            self._max_tokens = 3000
        def chunk(self, text):
            self._chunk_metadata = self._chunk_metadata[:1]
            return ["Question: What is X?\n\nAnswer: X is Y."]
        def chunk_pages(self, pages, overlap_tokens=0):
            self._chunk_metadata = self._chunk_metadata[:1]
            return [("Question: What is X?\n\nAnswer: X is Y.", 1)]
    monkeypatch.setattr(
        "app.utils.chunking.QuestionChunkingStrategy",
        _MockQuestionChunker,
    )
--- a/backend/app/test/test_phase8_metadata.py
+++ b/backend/app/test/test_phase8_metadata.py
@ -0,0 +1,149 @@
 """Phase 8 tests: Q&A metadata enrichment (Sub-Phase 8.2).
 Covers:
 - Metadata enrichment with Q&A-specific fields via chunk_metadata param
 - Backward compatibility: token strategy unchanged
 - Page number references question location
 - Chunk metadata merging with base metadata
 """
 import json
 import pytest
 from app.utils.metadata import extract_metadata
 def test_qa_metadata_fields(tmp_path):
    """Q&A fields supplied through chunk_metadata appear on the matching metadata entries."""
    source = tmp_path / "test.pdf"
    source.write_text("dummy content")
    first_entry = {
        "strategy_type": "question",
        "section_type": "qa",
        "question_index": 0,
        "question_id": "A1",
        "question_text": "What is X?",
        "section_heading": "(A) Section",
        "answer_contains_table": True,
        "source_page_range": [2, 5],
        "parent_topic": "Topic Name",
    }
    second_entry = {
        "strategy_type": "question",
        "section_type": "qa",
        "question_index": 1,
        "question_id": "A2",
        "question_text": "What is Y?",
        "section_heading": "(A) Section",
        "answer_contains_table": False,
        "source_page_range": [5, 7],
    }
    result = extract_metadata(
        file_path=str(source),
        chunks=["chunk 1", "chunk 2"],
        strategy_type="question",
        chunk_metadata=[first_entry, second_entry],
    )
    assert len(result) == 2
    first = result[0]
    assert first["strategy_type"] == "question"
    assert first["section_type"] == "qa"
    assert first["question_index"] == 0
    assert first["question_id"] == "A1"
    assert first["question_text"] == "What is X?"
    assert first["section_heading"] == "(A) Section"
    assert first["answer_contains_table"] is True
    assert first["source_page_range"] == [2, 5]
    assert first["parent_topic"] == "Topic Name"
    second = result[1]
    assert second["question_index"] == 1
    assert second["question_id"] == "A2"
    assert second["answer_contains_table"] is False
 def test_qa_metadata_topic_section(tmp_path):
    """Both section_heading and parent_topic from chunk_metadata survive into the output."""
    source = tmp_path / "test.pdf"
    source.write_text("dummy content")
    extra = {"section_heading": "(B) Traffic", "parent_topic": "Traffic Planning"}
    result = extract_metadata(
        file_path=str(source),
        chunks=["chunk"],
        strategy_type="question",
        chunk_metadata=[extra],
    )
    only = result[0]
    assert only["section_heading"] == "(B) Traffic"
    assert only["parent_topic"] == "Traffic Planning"
 def test_token_metadata_unchanged(tmp_path):
    """With strategy_type='token' and no chunk_metadata, the base fields are present and no Q&A field leaks in."""
    source = tmp_path / "test.txt"
    source.write_text("test content")
    result = extract_metadata(
        file_path=str(source),
        chunks=["chunk 1", "chunk 2"],
        original_filename="original.txt",
        strategy_type="token",
    )
    assert len(result) == 2
    for entry in result:
        assert "filename" in entry
        assert "upload_date" in entry
        assert "content_summary" in entry
        assert "chunk_index" in entry
        # strategy_type may be absent; when present it must be "token".
        assert entry.get("strategy_type", "token") == "token"
        assert "question_id" not in entry
 def test_page_number_from_question(tmp_path):
    """page_number comes from page_numbers while source_page_range is taken from chunk_metadata."""
    source = tmp_path / "test.pdf"
    source.write_text("dummy content")
    qa_fields = {
        "question_id": "A1",
        "source_page_range": [3, 8],
    }
    result = extract_metadata(
        file_path=str(source),
        chunks=["question chunk"],
        page_numbers=[3],
        strategy_type="question",
        chunk_metadata=[qa_fields],
    )
    only = result[0]
    assert only["page_number"] == 3
    assert only["source_page_range"] == [3, 8]
 def test_chunk_metadata_length_mismatch(tmp_path):
    """Three chunks with only two chunk_metadata entries raise ValueError mentioning 'chunk_metadata length'."""
    source = tmp_path / "test.pdf"
    source.write_text("dummy content")
    three_chunks = ["a", "b", "c"]
    two_entries = [{}, {}]
    with pytest.raises(ValueError, match="chunk_metadata length"):
        extract_metadata(
            file_path=str(source),
            chunks=three_chunks,
            chunk_metadata=two_entries,
        )
 def test_chunk_metadata_empty_no_error(tmp_path):
    """An empty chunk_metadata list is accepted without error, even alongside one chunk.
    One metadata entry per chunk is still returned.
    """
    source = tmp_path / "test.pdf"
    source.write_text("dummy content")
    result = extract_metadata(
        file_path=str(source),
        chunks=["a"],
        chunk_metadata=[],
    )
    assert len(result) == 1
--- a/backend/app/test/test_phase8_qa_chunking.py
+++ b/backend/app/test/test_phase8_qa_chunking.py
@ -0,0 +1,481 @@
 """Phase 8 tests: Q&A-pair chunking strategy (Sub-Phase 8.1).
 Covers:
 - LLM structure detection response parsing (parse_llm_structure_response)
 - Mixed format handling (問/答 + section headings)
 - Narrative-only text (no Q&A format)
 - Speaking notes (發言要點) chunking by bullet
 - Regex fast-pass for Chinese 問/答 format
 - Regex fast-pass for English Q1/Q2 format
 - Multi-page section tracking with [PAGE_BREAK] markers
 - ChunkingStrategy ABC compliance
 - Page number references question (問) page, not answer
 - Size limit: oversized sections recursively split with heading preserved
 - build_chunks_from_sections output verification
 - preprocess_text: footer stripping, colon normalization, page break insertion
 """
 import json
 from typing import List, Tuple
 from unittest.mock import AsyncMock, MagicMock
 import pytest
 from app.utils.qa_chunking import (
    Section,
    preprocess_text,
    build_structure_detection_prompt,
    parse_llm_structure_response,
    split_chinese_qa,
    split_english_qa,
    build_chunks_from_sections,
 )
 from app.utils.chunking import (
    ChunkingStrategy,
    QuestionChunkingStrategy,
    get_chunking_strategy,
 )
 # ---------------------------------------------------------------------------
 # Fixtures
 # ---------------------------------------------------------------------------
@pytest.fixture
def mock_settings():
    """Return a MagicMock standing in for Settings, preloaded with Q&A chunking defaults."""
    settings_stub = MagicMock()
    settings_stub.configure_mock(
        default_chunking_strategy="question",
        qa_vision_enabled=False,
        qa_max_chunk_tokens=3000,
        qa_structure_model="",
        qa_include_internal_refs=True,
        qa_cache_vision_results=True,
        chunk_size=1000,
        chunk_overlap=200,
        llm_model_name="test-model",
        llm_api_key="test-key",
        llm_base_url="https://example.com/v1",
        llm_timeout=30.0,
        llm_enable_thinking=False,
        vllm_engine=False,
    )
    return settings_stub
 # Canned structure-detection reply shared by the parsing tests: a JSON string
 # whose "sections" list holds one qa, one narrative and one speaking_notes entry.
 SAMPLE_LLM_RESPONSE = json.dumps({
    "sections": [
        {
            "type": "qa",
            "heading": "(A) 排水系統",
            "qa_id": "A1",
            "question": "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化？",
            "answer": "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。",
            "start_page": 2,
            "end_page": 3,
            "has_table": False,
            "parent_topic": "排水系統",
        },
        {
            "type": "narrative",
            "heading": "(1) 住戶的安置補償",
            "content": "合資格住戶可選擇(i)「須通過經濟狀況審查」安置安排。",
            "start_page": 2,
            "end_page": 5,
            "has_table": False,
        },
        {
            "type": "speaking_notes",
            "heading": "發言要點",
            "content": "⚫ 古洞北／粉嶺北新發展區是北部都會區內的新市鎮發展項目\n⚫ 第一期發展預計於2030年完成",
            "start_page": 1,
            "end_page": 2,
            "has_table": False,
        },
    ]
 })
 # ---------------------------------------------------------------------------
 # Test: LLM structure detection parsing
 # ---------------------------------------------------------------------------
 class TestLLMStructureDetection:
    """Checks how parse_llm_structure_response turns LLM JSON replies into sections."""
    def test_llm_structure_detection(self):
        """The shared sample reply parses into qa, narrative and speaking_notes sections, in order."""
        parsed = parse_llm_structure_response(SAMPLE_LLM_RESPONSE)
        assert len(parsed) == 3
        qa_section = parsed[0]
        assert qa_section.type == "qa"
        assert qa_section.qa_id == "A1"
        assert qa_section.question == "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化？"
        assert qa_section.answer == "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。"
        assert qa_section.start_page == 2
        assert qa_section.end_page == 3
        assert qa_section.heading == "(A) 排水系統"
        assert qa_section.parent_topic == "排水系統"
        narrative_section = parsed[1]
        assert narrative_section.type == "narrative"
        assert narrative_section.heading == "(1) 住戶的安置補償"
        assert "合資格住戶" in narrative_section.content
        notes_section = parsed[2]
        assert notes_section.type == "speaking_notes"
        assert "⚫" in notes_section.content
    def test_llm_handles_mixed_formats(self):
        """A reply mixing a qa entry and a narrative entry keeps both types in order."""
        qa_entry = {
            "type": "qa",
            "heading": "(B) 交通",
            "qa_id": "B1",
            "question": "新建道路何時通車？",
            "answer": "預計2027年通車。",
            "start_page": 3,
            "end_page": 4,
            "has_table": False,
        }
        narrative_entry = {
            "type": "narrative",
            "heading": "背景",
            "content": "本文件說明交通規劃。",
            "start_page": 1,
            "end_page": 2,
            "has_table": False,
        }
        reply = json.dumps({"sections": [qa_entry, narrative_entry]})
        parsed = parse_llm_structure_response(reply)
        assert len(parsed) == 2
        assert parsed[0].type == "qa"
        assert parsed[1].type == "narrative"
    def test_llm_handles_no_qa_format(self):
        """A reply containing only narrative entries yields only narrative sections."""
        intro_entry = {
            "type": "narrative",
            "heading": "Introduction",
            "content": "This document provides background on policy matters.",
            "start_page": 1,
            "end_page": 5,
            "has_table": False,
        }
        analysis_entry = {
            "type": "narrative",
            "heading": "Analysis",
            "content": "The analysis covers multiple dimensions.",
            "start_page": 5,
            "end_page": 13,
            "has_table": False,
        }
        reply = json.dumps({"sections": [intro_entry, analysis_entry]})
        parsed = parse_llm_structure_response(reply)
        assert len(parsed) == 2
        assert all(section.type == "narrative" for section in parsed)
    def test_llm_handles_speaking_notes(self):
        """A single speaking_notes entry keeps all three of its bullet markers."""
        notes_entry = {
            "type": "speaking_notes",
            "heading": "發言要點",
            "content": "⚫ 要點一：政策方向\n⚫ 要點二：實施計劃\n⚫ 要點三：預算安排",
            "start_page": 1,
            "end_page": 2,
            "has_table": False,
        }
        reply = json.dumps({"sections": [notes_entry]})
        parsed = parse_llm_structure_response(reply)
        assert len(parsed) == 1
        only = parsed[0]
        assert only.type == "speaking_notes"
        assert only.content.count("⚫") == 3
    def test_parse_markdown_fenced_json(self):
        """A reply wrapped in a ```json ... ``` fence parses the same as the bare JSON."""
        wrapped = "".join(['```json\n', SAMPLE_LLM_RESPONSE, '\n```'])
        parsed = parse_llm_structure_response(wrapped)
        assert len(parsed) == 3
    def test_parse_invalid_json_raises(self):
        """Non-JSON input raises ValueError whose message matches 'Invalid JSON'."""
        with pytest.raises(ValueError, match="Invalid JSON"):
            parse_llm_structure_response("this is not json")
 # ---------------------------------------------------------------------------
 # Test: Regex fast-pass
 # ---------------------------------------------------------------------------
 class TestRegexFastPass:
    """Checks the regex-based splitters split_chinese_qa and split_english_qa."""
    def test_regex_fastpass_chinese(self):
        """Text with 問/答 markers is split into at least two qa sections."""
        sample = "".join([
            "(A) 排水系統\n",
            "問 B1：古洞北的設計是否能抵禦氣候變化？\n",
            "答 B1：研究顧問已為古洞北新發展區進行了評估。\n",
            "問 B2：第二個問題是什麼？\n",
            "答 B2：這是第二個問題的答案。\n",
        ])
        found = split_chinese_qa(sample)
        assert len(found) >= 2
        # Every section produced by the splitter must be typed as Q&A.
        assert all(section.type == "qa" for section in found)
        # The first question is the one mentioning 古洞北.
        assert "古洞北" in found[0].question
    def test_regex_fastpass_chinese_no_match(self):
        """Text without 問/答 markers makes split_chinese_qa return an empty list."""
        sample = "This is plain text without any Q&A markers."
        assert split_chinese_qa(sample) == []
    def test_regex_fastpass_english(self):
        """Text with Q1/Q2 markers is split into at least two qa sections."""
        sample = "".join([
            "Background information here.\n\n",
            "Q1 What is the timeline for the project?\n",
            "The project is expected to complete by 2027.\n",
            "Q2 How much will it cost?\n",
            "The estimated cost is HK$500 million.\n",
        ])
        found = split_english_qa(sample)
        assert len(found) >= 2
        assert all(section.type == "qa" for section in found)
        assert any("timeline" in (section.question or "").lower() for section in found)
    def test_regex_fastpass_english_no_match(self):
        """Text without English Q markers makes split_english_qa return an empty list."""
        sample = "純中文文本沒有英文問答標記。"
        assert split_english_qa(sample) == []
 # ---------------------------------------------------------------------------
 # Test: Multi-page tracking
 # ---------------------------------------------------------------------------
 class TestMultiPage:
    """Checks page-break markers emitted by preprocess_text for multi-page input."""
    def test_multi_page_sections(self):
        """Each of the three input pages gets its own [PAGE_BREAK: N] marker in the output."""
        page_inputs = [
            (1, "Header line\n(A) Water drainage\nSome intro text"),
            (2, "More drainage info\nFooter text X-1"),
            (3, "New section begins\n(B) Traffic planning"),
        ]
        merged = preprocess_text(page_inputs)
        # One marker per page number is expected.
        assert "[PAGE_BREAK: 1]" in merged
        assert "[PAGE_BREAK: 2]" in merged
        assert "[PAGE_BREAK: 3]" in merged
 # ---------------------------------------------------------------------------
 # Test: ABC contract
 # ---------------------------------------------------------------------------
 class TestABCContract:
    """Checks that the strategies and the factory honour the ChunkingStrategy interface."""
    def test_abc_contract(self):
        """A QuestionChunkingStrategy instance is a ChunkingStrategy."""
        settings_stub = MagicMock()
        settings_stub.qa_max_chunk_tokens = 3000
        settings_stub.qa_include_internal_refs = True
        instance = QuestionChunkingStrategy(settings=settings_stub)
        assert isinstance(instance, ChunkingStrategy)
    def test_get_chunking_strategy_factory(self, mock_settings):
        """The factory returns a ChunkingStrategy for 'token' and a QuestionChunkingStrategy for 'question'."""
        from_token = get_chunking_strategy("token", mock_settings)
        assert isinstance(from_token, ChunkingStrategy)
        from_question = get_chunking_strategy("question", mock_settings)
        assert isinstance(from_question, QuestionChunkingStrategy)
 # ---------------------------------------------------------------------------
 # Test: Page number reference
 # ---------------------------------------------------------------------------
 class TestPageNumberReference:
    """Checks which page a built Q&A chunk is attributed to."""
    def test_page_number_reference_question(self):
        """A qa section spanning pages 5-8 yields one chunk on page 5 with source_page_range [5, 8]."""
        qa_section = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question="What is X?",
            answer="X is Y.",
            start_page=5,
            end_page=8,
        )
        built = build_chunks_from_sections([qa_section])
        assert len(built) == 1
        _text, page, meta = built[0]
        # The chunk's page is the section's start_page, i.e. where the question sits.
        assert page == 5
        assert meta.get("source_page_range") == [5, 8]
 # ---------------------------------------------------------------------------
 # Test: Size limit recursive split
 # ---------------------------------------------------------------------------
 class TestSizeLimit:
    """Checks splitting of sections that exceed the token limit."""
    def test_size_limit(self):
        """A qa section with a very long answer, built with max_tokens=500, splits into several chunks that each repeat the question."""
        # 80 paragraphs of filler make the answer far larger than the limit below.
        paragraphs = [f"Paragraph {i}: " + "x" * 200 for i in range(80)]
        oversized = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question="What is the detailed plan?",
            answer="\n\n".join(paragraphs),
            start_page=2,
            end_page=5,
            has_table=False,
        )
        # A deliberately small limit forces the split.
        built = build_chunks_from_sections([oversized], max_tokens=500)
        assert len(built) > 1
        for text, page, _meta in built:
            # Every piece carries the question text.
            assert "What is the detailed plan?" in text
            # Every piece is attributed to the question's page.
            assert page == 2
 # ---------------------------------------------------------------------------
 # Test: build_chunks_from_sections
 # ---------------------------------------------------------------------------
 class TestBuildChunksFromSections:
    def test_build_chunks_from_sections(self):
        """Verify chunk texts and metadata produced for each section type."""
        sections = [
            Section(
                type="qa",
                heading="(A) 排水系統",
                qa_id="A1",
                question="古洞北的設計是否能抵禦氣候變化？",
                answer="研究顧問已為古洞北進行了評估。",
                start_page=2,
                end_page=3,
                has_table=True,
                parent_topic="排水系統",
            ),
            Section(
                type="narrative",
                heading="(1) 住戶的安置補償",
                content="合資格住戶可選擇安置安排。",
                start_page=3,
                end_page=5,
                has_table=False,
            ),
            Section(
                type="speaking_notes",
                heading="發言要點",
                content="⚫ 要點一：政策方向\n⚫ 要點二：實施計劃",
                start_page=1,
                end_page=1,
                has_table=False,
            ),
            Section(
                type="toc",
                heading="目錄",
                content="Page 1 ... Page 2",
                start_page=1,
                end_page=1,
                has_table=False,
            ),
        ]
        chunks = build_chunks_from_sections(sections)
        # Exactly: 1 QA + 1 narrative + 2 speaking_notes bullets + 0 toc = 4.
        # An exact count (rather than ">=") also catches spurious extra chunks.
        assert len(chunks) == 4
        # First chunk: QA
        qa_text, qa_page, qa_meta = chunks[0]
        assert "古洞北" in qa_text
        assert qa_page == 2
        assert qa_meta["section_type"] == "qa"
        assert qa_meta["question_id"] == "A1"
        assert qa_meta["question_index"] == 0
        assert qa_meta["answer_contains_table"] is True
        assert qa_meta["section_heading"] == "(A) 排水系統"
        # Find the narrative chunk
        narr_texts = [t for t, _, m in chunks if m.get("section_type") == "narrative"]
        assert len(narr_texts) == 1
        assert "住戶的安置補償" in narr_texts[0]
        assert "合資格住戶" in narr_texts[0]
        # Find speaking_notes chunks: one chunk per bullet
        notes_texts = [t for t, _, m in chunks if m.get("section_type") == "speaking_notes"]
        assert len(notes_texts) == 2
        for text in notes_texts:
            assert "要點" in text
        # TOC sections never produce chunks
        assert not [m for _, _, m in chunks if m.get("section_type") == "toc"]
 # ---------------------------------------------------------------------------
 # Test: preprocess_text
 # ---------------------------------------------------------------------------
 class TestPreprocessText:
    def test_preprocess_text(self):
        """Page markers are inserted, colons normalized, and footer lines removed."""
        first_page = "Header\n(A) Section Title\nX-1\n2024-01-15"
        second_page = "Content with：fullwidth colon\nMore text：here"
        result = preprocess_text([(1, first_page), (2, second_page)])
        # One marker per input page
        for page_number in (1, 2):
            assert f"[PAGE_BREAK: {page_number}]" in result
        # Fullwidth colons become ASCII colons
        assert "：" not in result
        assert ":" in result
        # Footer code and the date line that follows it are both gone
        for footer_fragment in ("X-1", "2024-01-15"):
            assert footer_fragment not in result
 # ---------------------------------------------------------------------------
 # Test: build_structure_detection_prompt
 # ---------------------------------------------------------------------------
 class TestBuildPrompt:
    def test_build_structure_detection_prompt(self):
        """The prompt names the document source, the section types, and embeds the text."""
        text = "Sample document text [PAGE_BREAK: 1]"
        prompt = build_structure_detection_prompt(text)
        lowered = prompt.lower()
        assert any(
            phrase in prompt
            for phrase in ("Hong Kong Legislative Council", "Legislative Council")
        )
        assert "qa" in lowered or "問" in prompt
        assert "narrative" in lowered
        assert any(label in lowered for label in ("speaking_notes", "speaking notes"))
        assert text in prompt
--- a/backend/app/utils/chunking.py
+++ b/backend/app/utils/chunking.py
@ -6,8 +6,15 @@ token-based windows.
 """
 from __future__ import annotations
 import logging
 from abc import ABC, abstractmethod
-from typing import List, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
 if TYPE_CHECKING:
    from app.core.config import Settings
    from app.services.llm_client import LLMClient
 logger = logging.getLogger(__name__)
 class ChunkingStrategy(ABC):
@ -117,3 +124,111 @@ class TokenChunkingStrategy(ChunkingStrategy):
            results.append(("\n".join(parts), page_num))
        return results
 class QuestionChunkingStrategy(ChunkingStrategy):
    """Chunk text by detecting Q&A structure using LLM and/or regex patterns.

    Designed for LegCo documents with explicit 問/答 or Q1/Q2 markers.
    Falls back to a single narrative section when no structure is detected.

    After each ``chunk`` / ``chunk_pages`` call, ``_chunk_metadata`` holds one
    metadata dict per returned chunk (same order); it is emptied when the call
    returns no chunks.
    """

    def __init__(
        self,
        settings: "Settings",
        llm_client: Optional["LLMClient"] = None,
    ):
        self._settings = settings
        self._llm_client = llm_client
        self._max_tokens = getattr(settings, "qa_max_chunk_tokens", 3000)
        self._chunk_metadata: List[dict] = []

    @staticmethod
    def _detect_by_regex(text: str) -> list:
        """Run the regex fast-passes: Chinese 問/答 first, then English Q-number."""
        from app.utils.qa_chunking import split_chinese_qa, split_english_qa

        return split_chinese_qa(text) or split_english_qa(text)

    def _detect_by_llm(self, full_text: str) -> list:
        """Ask the LLM to classify sections; return [] when unavailable or on failure."""
        import asyncio

        from app.utils.qa_chunking import (
            build_structure_detection_prompt,
            parse_llm_structure_response,
        )

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread, so we may drive the coroutine ourselves.
            pass
        else:
            # A synchronous method cannot block on a coroutine from inside a running loop.
            logger.warning(
                "Skipping LLM structure detection: chunk_pages() was called from a running event loop"
            )
            return []
        prompt = build_structure_detection_prompt(full_text)
        try:
            response = asyncio.run(
                self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection")
            )
            return parse_llm_structure_response(response)
        except Exception:
            # Best-effort step: any failure falls through to the narrative fallback.
            logger.warning("LLM structure detection failed, using fallback", exc_info=True)
            return []

    def chunk(self, text: str) -> List[str]:
        """Split text into chunks using Q&A detection (for DOCX/TXT)."""
        # Reset first so an early return never exposes a previous document's metadata.
        self._chunk_metadata = []
        if not text or not text.strip():
            return []
        from app.utils.qa_chunking import Section, build_chunks_from_sections

        sections = self._detect_by_regex(text)
        if not sections:
            sections = [Section(type="narrative", content=text, start_page=1, end_page=1)]
        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [chunk_text for chunk_text, _, _ in results]

    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
    ) -> List[Tuple[str, int]]:
        """Split page-segmented text using Q&A detection (for PDF).

        Returns list of (chunk_text, page_number) where page_number
        references the question location for Q&A chunks.
        ``overlap_tokens`` is accepted for interface compatibility and is not used.
        """
        # Reset first so an early return never exposes a previous document's metadata.
        self._chunk_metadata = []
        if not pages:
            return []
        from app.utils.qa_chunking import (
            Section,
            build_chunks_from_sections,
            preprocess_text,
        )

        full_text = preprocess_text(pages)
        sections = self._detect_by_regex(full_text)
        if not sections and self._llm_client is not None:
            sections = self._detect_by_llm(full_text)
        if not sections:
            # Use the real first/last page numbers rather than assuming pages are 1..N.
            sections = [
                Section(
                    type="narrative",
                    content=full_text,
                    start_page=pages[0][0],
                    end_page=pages[-1][0],
                )
            ]
        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [(chunk_text, page_num) for chunk_text, page_num, _ in results]
 def get_chunking_strategy(name: str, settings: "Settings") -> ChunkingStrategy:
    """Build the chunking strategy selected by ``name``.

    Args:
        name: "question" selects the Q&A strategy; any other value selects
            the token-window strategy.
        settings: Application settings instance.

    Returns:
        ChunkingStrategy instance.
    """
    wants_question_strategy = name == "question"
    if not wants_question_strategy:
        return TokenChunkingStrategy(
            chunk_size=settings.chunk_size,
            overlap=settings.chunk_overlap,
        )
    return QuestionChunkingStrategy(settings=settings)
--- a/backend/app/utils/metadata.py
+++ b/backend/app/utils/metadata.py
@ -12,6 +12,8 @@ def extract_metadata(
    page_numbers: List[int | None] | None = None,
    chunk_file_paths: List[str | None] | None = None,
    document_id: str | None = None,
    strategy_type: str = "token",
    chunk_metadata: List[Dict[str, Any]] | None = None,
 ) -> List[Dict[str, Any]]:
    """Extract metadata for a list of text chunks.
@ -23,6 +25,10 @@ def extract_metadata(
    - chunk_file_path: path to the per-chunk source file
    - document_id: unique identifier linking all chunks to the same document
    Package 8 Q&A fields (present when chunk_metadata provided):
    - strategy_type, section_type, question_index, question_id, question_text,
      section_heading, answer_contains_table, source_page_range, parent_topic
    Args:
        file_path: Path to the file associated with the chunks.
        chunks: List of string chunks to generate metadata for.
@ -31,6 +37,12 @@ def extract_metadata(
        page_numbers: Optional per-chunk page numbers. Length must match chunks.
        chunk_file_paths: Optional per-chunk source file paths. Length must match chunks.
        document_id: Optional unique document identifier applied to all chunks.
        strategy_type: Chunking strategy used ("token" or "question"). Stored in
            each chunk's metadata.
        chunk_metadata: Optional per-chunk metadata dicts from Q&A strategy.
            Each dict is merged into the corresponding base metadata entry.
            Length must match chunks. Fields like question_id, question_index,
            section_type, etc. are forwarded to ChromaDB metadata.
    Returns:
        A list of metadata dictionaries, one per chunk. Empty list if chunks is empty.
@ -55,6 +67,11 @@ def extract_metadata(
            f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
        )
    if chunk_metadata is not None and len(chunk_metadata) > 0 and len(chunk_metadata) != len(chunks):
        raise ValueError(
            f"chunk_metadata length ({len(chunk_metadata)}) does not match chunks length ({len(chunks)})"
        )
    filename = original_filename if original_filename else os.path.basename(file_path)
    upload_date = datetime.now().isoformat()
@ -68,6 +85,7 @@ def extract_metadata(
            "content_summary": content_summary,
            "chunk_index": idx,
            "document_id": document_id,
            "strategy_type": strategy_type,
        }
        page_num = page_numbers[idx] if page_numbers else None
        if page_num is not None:
@ -75,6 +93,8 @@ def extract_metadata(
        cfp = chunk_file_paths[idx] if chunk_file_paths else None
        if cfp is not None:
            entry["chunk_file_path"] = cfp
        if chunk_metadata:
            entry.update(chunk_metadata[idx])
        metadata.append(entry)
    return metadata
--- a/backend/app/utils/qa_chunking.py
+++ b/backend/app/utils/qa_chunking.py
@ -0,0 +1,361 @@
 """Q&A-pair chunking utilities for Package 8.
 Provides section detection (LLM + regex), text preprocessing,
 and chunk building for LegCo documents with Q&A structure.
 """
 from __future__ import annotations
 import json
 import logging
 import re
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Tuple
 logger = logging.getLogger(__name__)
@dataclass
 class Section:
    """A detected section within a LegCo document.

    Instances are produced by the regex fast-passes and the LLM response parser
    in this module and consumed by ``build_chunks_from_sections``.
    """
    type: str  # "qa" | "narrative" | "speaking_notes" | "table" | "toc" | "heading_only"
    # Section heading; when non-empty it is prepended to chunk text as "[heading]".
    heading: str = ""
    # Question identifier such as "A1" or "Q3"; emitted as "question_id" metadata.
    qa_id: Optional[str] = None
    # Question and answer text; read only for "qa" sections.
    question: Optional[str] = None
    answer: Optional[str] = None
    # Body text; read for "narrative", "speaking_notes" and "table" sections.
    content: str = ""
    # start_page becomes the chunk's page number; both feed "source_page_range".
    start_page: int = 1
    end_page: int = 1
    # Emitted as "answer_contains_table" metadata for "qa" sections.
    has_table: bool = False
    # Emitted as "parent_topic" metadata for "qa" sections.
    parent_topic: str = ""
 _FOOTER_RE = re.compile(r"^[A-Z]-\d+\s*$", re.MULTILINE)
 _FOOTER_DATE_RE = re.compile(r"^[A-Z]-\d+\s*\n\d{4}-\d{2}-\d{2}$", re.MULTILINE)
 _HEADER_LETTER_RE = re.compile(r"^(\([A-Z]\))\s*$", re.MULTILINE)
 _FULLWIDTH_COLON_RE = re.compile("[︰：]")


 def preprocess_text(pages: List[Tuple[int, str]]) -> str:
    """Join pages into one string, cleaning each page and tagging it with its number.

    For every page: footer lines ("X-1", optionally followed by a date line)
    are removed, standalone "(A)"-style header lines are removed on every page
    except the first, fullwidth colons become ":", and the page is prefixed
    with a "[PAGE_BREAK: N]" line.
    """
    rendered: List[str] = []
    is_first_page = True
    for page_num, page_text in pages:
        # The footer+date pattern must run first, while the footer line still exists.
        cleaned = _FOOTER_RE.sub("", _FOOTER_DATE_RE.sub("", page_text))
        if not is_first_page:
            cleaned = _HEADER_LETTER_RE.sub("", cleaned)
        is_first_page = False
        cleaned = _FULLWIDTH_COLON_RE.sub(":", cleaned)
        rendered.append(f"[PAGE_BREAK: {page_num}]\n{cleaned}")
    return "\n".join(rendered)
 _STRUCTURE_PROMPT_TEMPLATE = """You are analyzing a Hong Kong Legislative Council document.
 The text has page markers like [PAGE_BREAK: N] showing where pages begin.
 For each distinct section in this document, identify:
 1. The section type:
   - "qa": a question-and-answer pair (問/答 or Q1/Q2 format)
   - "narrative": policy text, explanatory paragraphs, section content with bullets
   - "speaking_notes": briefing points (發言要點) with bullet markers
   - "table": standalone data tables (not embedded in answers)
   - "toc": table of contents
   - "heading_only": a section heading with no following content
 2. For "qa" sections:
   - The question text (exact)
   - The answer text (exact, including tables, bullet lists, and [內部參考] content)
   - The question ID if present (e.g. "A1", "Q3")
   - The start page and end page
 3. For all sections:
   - The section heading (e.g. "(A) 排水系統", "(1) 住戶的安置補償")
   - The start page and end page
   - Whether the section contains tables
 Return JSON:
 {{
  "sections": [
    {{
      "type": "qa",
      "heading": "(A) 排水系統",
      "qa_id": "A1",
      "question": "...",
      "answer": "...",
      "start_page": 2,
      "end_page": 3,
      "has_table": true,
      "parent_topic": "排水系統"
    }},
    {{
      "type": "narrative",
      "heading": "(1) 住戶的安置補償",
      "content": "...",
      "start_page": 2,
      "end_page": 5,
      "has_table": false
    }}
  ]
 }}
 DOCUMENT TEXT:
 {document_text}"""


 def build_structure_detection_prompt(text: str) -> str:
    """Return the section-classification prompt with ``text`` embedded at the end.

    The doubled braces in the template are ``str.format`` escapes for the
    literal JSON example; only ``{document_text}`` is substituted.
    """
    substitutions = {"document_text": text}
    return _STRUCTURE_PROMPT_TEMPLATE.format(**substitutions)
 _MARKDOWN_FENCE_RE = re.compile(r"```(?:json)?\s*\n?(.*?)\n?```", re.DOTALL)


 def parse_llm_structure_response(response_text: str) -> List[Section]:
    """Parse the JSON returned by the LLM into ``Section`` objects.

    A markdown code fence around the JSON is stripped first. Individual fields
    are normalized so that a sloppy response cannot produce an unusable section:
    JSON ``null`` falls back to the field default, page numbers are coerced to
    ``int``, and an unrecognized section type is treated as "narrative" so its
    content is still chunked rather than silently dropped downstream.

    Raises:
        ValueError: if the response is not valid JSON, is not a JSON object,
            or its "sections" value is not a list.
    """
    known_types = ("qa", "narrative", "speaking_notes", "table", "toc", "heading_only")

    def _text(raw: dict, key: str) -> str:
        value = raw.get(key)
        return "" if value is None else str(value)

    def _optional_text(raw: dict, key: str) -> Optional[str]:
        value = raw.get(key)
        return None if value is None else str(value)

    def _page(raw: dict, key: str, default: int) -> int:
        try:
            return int(raw.get(key))
        except (TypeError, ValueError):
            return default

    cleaned = response_text.strip()
    fence_match = _MARKDOWN_FENCE_RE.search(cleaned)
    if fence_match:
        cleaned = fence_match.group(1).strip()
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Invalid JSON from LLM structure detection: {exc}") from exc
    if not isinstance(data, dict):
        raise ValueError(
            f"LLM structure detection must return a JSON object, got {type(data).__name__}"
        )
    sections_raw = data.get("sections") or []
    if not isinstance(sections_raw, list):
        raise ValueError(
            f"LLM structure detection 'sections' must be a list, got {type(sections_raw).__name__}"
        )

    sections: List[Section] = []
    for raw in sections_raw:
        if not isinstance(raw, dict):
            logger.warning("Ignoring non-object section entry from LLM: %r", raw)
            continue
        section_type = _text(raw, "type").strip().lower() or "narrative"
        if section_type not in known_types:
            logger.warning("Unknown section type %r from LLM; treating as narrative", section_type)
            section_type = "narrative"
        start_page = _page(raw, "start_page", 1)
        sections.append(Section(
            type=section_type,
            heading=_text(raw, "heading"),
            qa_id=_optional_text(raw, "qa_id"),
            question=_optional_text(raw, "question"),
            answer=_optional_text(raw, "answer"),
            content=_text(raw, "content"),
            start_page=start_page,
            # A missing end page means the section is confined to its start page.
            end_page=_page(raw, "end_page", start_page),
            has_table=bool(raw.get("has_table", False)),
            parent_topic=_text(raw, "parent_topic"),
        ))
    return sections
 # A pair ends either right before the next "問<id>:" line or at the end of the
 # text. The end-of-text branch uses \s*\Z so the final pair is still matched
 # when the text does not finish with a newline.
 _CN_QA_RE = re.compile(
    r"問\s*([A-Z]\d+)\s*[︰：:]\s*(.*?)\s*"
    r"(?:\n\s*答\s*\1\s*[︰：:]\s*(.*?)\s*)"
    r"(?=\n\s*問\s*[A-Z]\d+\s*[︰：:]|\s*\Z)",
    re.DOTALL,
 )


 def split_chinese_qa(text: str) -> List[Section]:
    """Regex fast-pass for 問/答 format. Returns empty list if no matches found.

    Each "問<id>: ..." must be followed by a "答<id>: ..." with the same id.
    Only ``qa_id``, ``question`` and ``answer`` are filled in; page fields keep
    their ``Section`` defaults.
    """
    sections: List[Section] = []
    for m in _CN_QA_RE.finditer(text):
        sections.append(Section(
            type="qa",
            qa_id=m.group(1),
            question=m.group(2).strip(),
            answer=(m.group(3) or "").strip(),
        ))
    return sections
 # The answer is every following line up to (not including) the next line that
 # starts with "Q<number>". Blank lines are allowed so multi-paragraph answers
 # are kept whole; each alternative consumes at least one character, so the
 # repetition cannot loop on an empty match.
 _EN_QA_RE = re.compile(
    r"^(Q\d+)\s+(.*?)\s*$\n((?:(?!^Q\d+)(?:.*\n|.+))*)",
    re.MULTILINE,
 )


 def split_english_qa(text: str) -> List[Section]:
    """Regex fast-pass for Q-number format. Returns empty list if no matches found.

    A question is a line beginning with "Q<number>"; the rest of that line is
    the question text and the lines after it form the answer. Only ``qa_id``,
    ``question`` and ``answer`` are filled in; page fields keep their
    ``Section`` defaults.
    """
    sections: List[Section] = []
    for m in _EN_QA_RE.finditer(text):
        sections.append(Section(
            type="qa",
            qa_id=m.group(1),
            question=m.group(2).strip(),
            answer=m.group(3).strip(),
        ))
    return sections
 def _estimate_tokens(text: str) -> int:
    """Rough token estimate: ~1.3 tokens per CJK char, ~1 token per 4 chars otherwise."""
    cjk_count = sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
    other_count = len(text) - cjk_count
    return int(cjk_count * 1.3 + other_count / 4)


 def _split_oversized_qa(
    question: str, answer: str, page: int, heading: str,
    qa_id: Optional[str], question_index: int, has_table: bool,
    parent_topic: str, start_page: int, end_page: int,
    max_tokens: int,
 ) -> List[Tuple[str, int, dict]]:
    """Split an oversized Q&A answer into parts, each prefixed with the question.

    The answer is cut at paragraph boundaries ("\\n\\n"), or at single newlines
    when it has no paragraph breaks, and consecutive pieces are packed greedily
    while the rendered chunk stays within ``max_tokens``. Pieces are rejoined
    with the same separator they were split on, and the "[heading]" line is
    kept so split chunks look like unsplit ones.

    A single piece that is larger than ``max_tokens`` on its own is emitted
    as-is (it is not cut further). At least one chunk is always returned.

    Returns:
        List of (chunk_text, page, metadata) tuples, one per part.
    """
    separator = "\n\n"
    parts = answer.split(separator)
    if len(parts) <= 1:
        separator = "\n"
        parts = answer.split(separator)

    prefix = f"[{heading}]\n" if heading else ""

    def _render(body: str, label: str) -> str:
        return f"{prefix}Question: {question}\n\nAnswer (part {label}): {body}"

    # Greedily pack pieces; "1/N" stands in for the real label while sizing.
    sub_chunks: List[str] = []
    current = ""
    for part in parts:
        candidate = f"{current}{separator}{part}" if current else part
        if current and _estimate_tokens(_render(candidate, "1/N")) > max_tokens:
            sub_chunks.append(current)
            current = part
        else:
            current = candidate
    if current:
        sub_chunks.append(current)
    if not sub_chunks:
        # Empty/whitespace-only answer: still emit the question instead of dropping it.
        sub_chunks.append(answer)

    total = len(sub_chunks)
    results: List[Tuple[str, int, dict]] = []
    for i, sub in enumerate(sub_chunks):
        meta = {
            "strategy_type": "question",
            "section_type": "qa",
            "question_index": question_index,
            "question_id": qa_id,
            "question_text": question,
            "section_heading": heading,
            "answer_contains_table": has_table,
            "source_page_range": [start_page, end_page],
            "parent_topic": parent_topic,
        }
        results.append((_render(sub, f"{i + 1}/{total}"), page, meta))
    return results
 def build_chunks_from_sections(
    sections: List[Section], max_tokens: int = 3000,
 ) -> List[Tuple[str, int, dict]]:
    """Turn detected sections into (chunk_text, page_number, metadata_dict) tuples.

    - "toc" and "heading_only" sections (and any unrecognized type) yield nothing.
    - "qa" sections yield one chunk, or several via ``_split_oversized_qa`` when
      the chunk is estimated to exceed ``max_tokens``; each "qa" section takes
      the next ``question_index``.
    - "narrative" sections yield one chunk, or paragraph-packed chunks when oversized.
    - "speaking_notes" sections yield one chunk per "⚫" bullet.
    - "table" sections yield one chunk.

    The page number of every chunk is the section's ``start_page``.
    """
    handled_kinds = ("qa", "narrative", "speaking_notes", "table")
    output: List[Tuple[str, int, dict]] = []
    next_qa_index = 0

    for sec in sections:
        kind = sec.type
        if kind not in handled_kinds:
            continue

        label = f"[{sec.heading}]\n" if sec.heading else ""
        first_page = sec.start_page

        if kind == "qa":
            q_text = sec.question if sec.question else ""
            a_text = sec.answer if sec.answer else ""
            body = f"{label}Question: {q_text}\n\nAnswer: {a_text}"
            if _estimate_tokens(body) > max_tokens:
                output.extend(_split_oversized_qa(
                    question=q_text,
                    answer=a_text,
                    page=first_page,
                    heading=sec.heading,
                    qa_id=sec.qa_id,
                    question_index=next_qa_index,
                    has_table=sec.has_table,
                    parent_topic=sec.parent_topic,
                    start_page=sec.start_page,
                    end_page=sec.end_page,
                    max_tokens=max_tokens,
                ))
            else:
                output.append((body, first_page, {
                    "strategy_type": "question",
                    "section_type": "qa",
                    "question_index": next_qa_index,
                    "question_id": sec.qa_id,
                    "question_text": q_text,
                    "section_heading": sec.heading,
                    "answer_contains_table": sec.has_table,
                    "source_page_range": [sec.start_page, sec.end_page],
                    "parent_topic": sec.parent_topic,
                }))
            next_qa_index += 1
            continue

        # The remaining kinds all read ``content`` and skip blank sections.
        text = sec.content
        if not text.strip():
            continue

        if kind == "narrative":
            base_meta = {
                "strategy_type": "question",
                "section_type": "narrative",
                "section_heading": sec.heading,
                "source_page_range": [sec.start_page, sec.end_page],
            }
            whole = f"{label}{text}"
            if _estimate_tokens(whole) <= max_tokens:
                output.append((whole, first_page, base_meta))
                continue
            # Oversized: pack paragraphs greedily, repeating the heading on each chunk.
            pending = ""
            for para in text.split("\n\n"):
                merged = (pending + "\n\n" + para) if pending else para
                if pending and _estimate_tokens(f"{label}{merged}") > max_tokens:
                    output.append((f"{label}{pending}", first_page, dict(base_meta)))
                    pending = para
                else:
                    pending = merged
            if pending:
                output.append((f"{label}{pending}", first_page, dict(base_meta)))
        elif kind == "speaking_notes":
            # Zero-width split keeps each "⚫" marker attached to its own bullet.
            bullets = [piece.strip() for piece in re.split(r"(?=⚫)", text)]
            bullets = [piece for piece in bullets if piece]
            if not bullets:
                bullets = [text]
            for bullet in bullets:
                output.append((f"{label}{bullet}", first_page, {
                    "strategy_type": "question",
                    "section_type": "speaking_notes",
                    "section_heading": sec.heading,
                    "source_page_range": [sec.start_page, sec.end_page],
                }))
        else:  # kind == "table"
            output.append((f"{label}{text}", first_page, {
                "strategy_type": "question",
                "section_type": "table",
                "section_heading": sec.heading,
                "answer_contains_table": True,
                "source_page_range": [sec.start_page, sec.end_page],
            }))
    return output
--- a/backend/app/utils/table_extraction.py
+++ b/backend/app/utils/table_extraction.py
@ -0,0 +1,147 @@
 """Table extraction utilities for Package 8.
 Provides vision-based and text-based table detection and markdown conversion
 for LegCo documents. Uses the existing LLM model (vision-capable) for
 table-to-markdown conversion.
 """
 from __future__ import annotations
 import hashlib
 import json
 import logging
 import os
 from pathlib import Path
 from typing import List, Optional
 logger = logging.getLogger(__name__)
 _CACHE_DIR = Path(__file__).resolve().parent.parent.parent / ".cache" / "vision_tables"
 async def extract_tables_vision(page_images: List[str], llm_client) -> List[str]:
    """Ask the vision LLM to render each page image as Markdown.

    Each entry of ``page_images`` is embedded in a ``data:image/png;base64,``
    URL and sent, one request per page, through ``llm_client._client`` using
    ``llm_client.model``. Pages whose request fails (logged as a warning) or
    whose reply is blank are omitted, so the result may be shorter than the input.
    """
    instruction = (
        "Convert this page to Markdown. For any tables:\n"
        "- Use proper markdown table syntax with |---|---| alignment\n"
        "- Preserve all column headers and row labels\n"
        "- Do not modify or translate the content\n"
        "- If a table spans multiple pages, note it"
    )
    markdown_pages: List[str] = []
    for idx, img_b64 in enumerate(page_images):
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{img_b64}"},
        }
        request = [
            {"role": "user", "content": [{"type": "text", "text": instruction}, image_part]},
        ]
        try:
            response = await llm_client._client.chat.completions.create(
                model=llm_client.model,
                messages=request,
                temperature=0.1,
            )
            reply = (response.choices[0].message.content or "").strip()
            if reply:
                markdown_pages.append(reply)
        except Exception:
            logger.warning("Vision table extraction failed for page image %d", idx, exc_info=True)
    return markdown_pages
 # Per-line table heuristics (each pattern is searched against a single line):
 #   1. markdown separator row such as "|---|:--|"
 #   2. ASCII-art border such as "+----+"
 #   3. three or more columns separated by runs of 2+ whitespace characters
 # The third pattern must not require "\n": lines are already split on newlines.
 _TABLE_HEURISTIC_RE = [
    r"(?:\|[\s\-:]+\|)",
    r"(?:\+[-=]+\+)",
    r"(?:\S+\s{2,}){2,}\S+",
 ]
 _TABLE_REGION_PROMPT = (
    "Convert this raw table text extracted from a PDF into a markdown table.\n"
    "Preserve all data exactly. Detect column boundaries and alignment.\n\n"
    "{table_text}"
 )


 def _find_table_regions(text: str) -> List[str]:
    """Return runs of lines that look like tables (at least 3 lines each)."""
    import re

    regions: List[str] = []
    current_region: List[str] = []
    in_table = False
    for line in text.split("\n"):
        if any(re.search(pat, line) for pat in _TABLE_HEURISTIC_RE):
            in_table = True
            current_region.append(line)
        elif in_table and line.strip():
            # Non-blank lines directly after a table-like line stay in the region.
            current_region.append(line)
        else:
            if len(current_region) >= 3:
                regions.append("\n".join(current_region))
            current_region = []
            in_table = False
    if len(current_region) >= 3:
        regions.append("\n".join(current_region))
    return regions


 async def extract_tables_text(text: str, llm_client) -> List[str]:
    """Detect table-like text regions, send to LLM for markdown conversion.

    Each detected region is sent through ``llm_client.complete``. Regions whose
    conversion fails (logged as a warning) or returns blank text are omitted.
    Returns an empty list when no region is detected.
    """
    results: List[str] = []
    for region in _find_table_regions(text):
        prompt = _TABLE_REGION_PROMPT.format(table_text=region)
        try:
            response = await llm_client.complete(prompt, temperature=0.1, step_name="TableExtraction")
            if response.strip():
                results.append(response.strip())
        except Exception:
            logger.warning("Text-based table extraction failed", exc_info=True)
    return results
 def inject_tables_into_answer(answer: str, tables_md: List[str]) -> str:
    """Substitute markdown tables into ``answer`` at their header line.

    For each table, its first non-blank line (stripped) is used as an anchor:
    if that text occurs in the answer, every occurrence is replaced by the full
    markdown table. Tables with no non-blank line are skipped, because an empty
    anchor would make ``str.replace`` insert the table between every character.
    """
    result = answer
    for table_md in tables_md or ():
        anchor = next(
            (line.strip() for line in table_md.split("\n") if line.strip()),
            "",
        )
        if anchor and anchor in result:
            result = result.replace(anchor, table_md)
    return result
 def cache_vision_result(page_hash: str) -> Optional[str]:
    """Look up cached markdown for ``page_hash``; return None when nothing is cached.

    Entries live in ``_CACHE_DIR`` as ``<page_hash>.md`` files read as UTF-8.
    """
    entry = _CACHE_DIR.joinpath(f"{page_hash}.md")
    if not entry.exists():
        return None
    return entry.read_text(encoding="utf-8")
 def save_vision_result(page_hash: str, markdown: str) -> None:
    """Store ``markdown`` in the disk cache under ``<page_hash>.md`` (UTF-8).

    The cache directory is created on demand, including missing parents.
    """
    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
    entry = _CACHE_DIR.joinpath(f"{page_hash}.md")
    entry.write_text(markdown, encoding="utf-8")
 def compute_page_hash(page_image_b64: str) -> str:
    """Return the first 16 hex characters of the SHA-256 of the base64 image string."""
    digest = hashlib.sha256()
    digest.update(page_image_b64.encode("utf-8"))
    return digest.hexdigest()[:16]
--- a/frontend/src/components/ChunkList.tsx
+++ b/frontend/src/components/ChunkList.tsx
@ -56,9 +56,32 @@ export const ChunkList: React.FC<ChunkListProps> = ({
              <span className="text-xs font-medium text-gray-500 uppercase">
                Chunk {chunk.chunk_index}
              </span>
-              <span className="text-xs text-gray-400">
+              {chunk.strategy_type === 'question' && chunk.question_id ? (
-                Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
+                <>
-              </span>
+                  <span className="text-xs text-gray-600">
                    Q: {chunk.question_id}{chunk.question_text ? ` — ${chunk.question_text.length > 80 ? `${chunk.question_text.slice(0, 80)}...` : chunk.question_text}` : ''}
                  </span>
                  {chunk.topic_section && (
                    <span className="text-xs text-gray-500">
                      Topic: {chunk.topic_section}
                    </span>
                  )}
                  {chunk.source_page_range && chunk.source_page_range.length === 2 && (
                    <span className="text-xs text-gray-400">
                      Pages {chunk.source_page_range[0]}-{chunk.source_page_range[1]}
                    </span>
                  )}
                  {chunk.has_table && (
                    <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-amber-100 text-amber-700">
                      Contains table
                    </span>
                  )}
                </>
              ) : (
                <span className="text-xs text-gray-400">
                  Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
                </span>
              )}
            </div>
            <div className="text-sm text-gray-700 truncate" title={chunk.content_summary}>
              {chunk.content_summary.length > 100
@ -67,7 +90,12 @@ export const ChunkList: React.FC<ChunkListProps> = ({
            </div>
            {chunk.chunk_file_path && (
              <a
-                href={getPdfViewerUrl(chunk.chunk_file_path, chunk.page_number ?? undefined)}
+                href={getPdfViewerUrl(
                  chunk.chunk_file_path,
                  chunk.strategy_type === 'question' && chunk.source_page_range && chunk.source_page_range.length > 0
                    ? chunk.source_page_range[0]
                    : chunk.page_number ?? undefined
                )}
                target="_blank"
                rel="noopener noreferrer"
                className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline"
--- a/frontend/src/components/DocumentList.tsx
+++ b/frontend/src/components/DocumentList.tsx
@ -29,7 +29,18 @@ export const DocumentList: React.FC<DocumentListProps> = ({
            <div className="flex items-center space-x-3 flex-1">
              <FileText className="w-5 h-5 text-gray-500 flex-shrink-0" />
              <div className="flex-1 min-w-0">
-                <div className="font-medium text-gray-900 truncate">{doc.filename}</div>
+                <div className="flex items-center space-x-2">
                  <span className="font-medium text-gray-900 truncate">{doc.filename}</span>
                  {doc.chunking_strategy === 'question' ? (
                    <span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-blue-100 text-blue-700">
                      chunked by question
                    </span>
                  ) : (
                    <span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-gray-100 text-gray-700">
                      chunked by token
                    </span>
                  )}
                </div>
                <div className="text-sm text-gray-500">
                  {doc.chunk_count} chunks • Uploaded {doc.upload_date}
                </div>
--- a/frontend/src/lib/api.ts
+++ b/frontend/src/lib/api.ts
@ -1,5 +1,5 @@
 import axios from 'axios'
-import type { QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'
+import type { ChunkingStrategy, QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'
 const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 'http://localhost:8000/api/v1'
@ -48,10 +48,10 @@ export const queryDocumentStream = async (
  }
 }
-export const ingestDocument = async (file: File): Promise<IngestResponse> => {
+export const ingestDocument = async (file: File, strategy: ChunkingStrategy = 'token'): Promise<IngestResponse> => {
  const form = new FormData()
  form.append('file', file)
-  const resp = await apiClient.post<IngestResponse>('/ingest', form, {
+  const resp = await apiClient.post<IngestResponse>(`/ingest?strategy=${strategy}`, form, {
    headers: { 'Content-Type': 'multipart/form-data' },
  })
  return resp.data
--- a/frontend/src/lib/queries.tsx
+++ b/frontend/src/lib/queries.tsx
@ -1,7 +1,7 @@
 import React from 'react'
 import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
 import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api'
-import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
+import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, ChunkingStrategy, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
 import { useState, useCallback, useRef } from 'react'
 export const queryClient = new QueryClient()
@ -185,8 +185,8 @@ export const useQueryDocumentStream = () => {
 }
 export const useIngestDocument = () => {
   // Mutation hook for document ingestion: resolves to IngestResponse, rejects with Error.
   // The mutation variables are a single object ({ file, strategy }) rather than a bare File,
   // so callers must pass both the file and the chunking strategy to mutate/mutateAsync.
-  return useMutation<IngestResponse, Error, File>({
+  return useMutation<IngestResponse, Error, { file: File; strategy: ChunkingStrategy }>({
-    mutationFn: ingestDocument,
   // Unpack the variables object and forward it positionally to ingestDocument(file, strategy).
+    mutationFn: ({ file, strategy }) => ingestDocument(file, strategy),
   })
 }
--- a/frontend/src/pages/RAGDatabasePage.tsx
+++ b/frontend/src/pages/RAGDatabasePage.tsx
@ -1,10 +1,11 @@
 import React, { useState, useCallback, useMemo } from 'react'
-import { Database, AlertCircle, CheckCircle, XCircle, Loader2 } from 'lucide-react'
+import { Database, AlertCircle, CheckCircle, XCircle, Loader2, Type, MessageSquare } from 'lucide-react'
 import { useQueryClient } from '@tanstack/react-query'
 import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries'
 import { DocumentList } from '../components/DocumentList'
 import { ChunkList } from '../components/ChunkList'
 import { DocumentUpload } from '../components/DocumentUpload'
 import type { ChunkingStrategy } from '../types'
 interface FileUploadEntry {
  name: string
@ -22,6 +23,7 @@ export const RAGDatabasePage: React.FC = () => {
  const initialDocId = useMemo(() => getDocumentIdFromUrl(), [])
  const [expandedId, setExpandedId] = useState<string | null>(initialDocId)
  const [uploadEntries, setUploadEntries] = useState<FileUploadEntry[]>([])
  const [chunkingStrategy, setChunkingStrategy] = useState<ChunkingStrategy>('token')
  const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments()
  const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId)
@ -60,7 +62,7 @@ export const RAGDatabasePage: React.FC = () => {
    const results = await Promise.allSettled(
      files.map(async (file) => {
        try {
-          await ingestDocumentMutation.mutateAsync(file)
+          await ingestDocumentMutation.mutateAsync({ file, strategy: chunkingStrategy })
          setUploadEntries((prev) =>
            prev.map((e) =>
              e.name === file.name ? { ...e, status: 'success' as const } : e
@ -80,7 +82,7 @@ export const RAGDatabasePage: React.FC = () => {
    queryClient.invalidateQueries({ queryKey: ['documents'] })
    setTimeout(() => setUploadEntries([]), 5000)
-  }, [ingestDocumentMutation, queryClient])
+  }, [ingestDocumentMutation, queryClient, chunkingStrategy])
  const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length
  const successCount = uploadEntries.filter((e) => e.status === 'success').length
@ -127,6 +129,42 @@ export const RAGDatabasePage: React.FC = () => {
          />
        </div>
        <div className="mt-3 flex items-center space-x-4">
          <span className="text-sm font-medium text-gray-700">Chunking strategy:</span>
          <div className="flex items-center space-x-3">
            <label className="flex items-center space-x-2 cursor-pointer">
              <input
                type="radio"
                name="chunking-strategy"
                value="token"
                checked={chunkingStrategy === 'token'}
                onChange={() => setChunkingStrategy('token')}
                className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
              />
              <Type className="w-4 h-4 text-gray-500" />
              <div>
                <span className="text-sm font-medium text-gray-900">Chunk by Token</span>
                <span className="text-xs text-gray-500 ml-1">Split by token windows with overlap</span>
              </div>
            </label>
            <label className="flex items-center space-x-2 cursor-pointer">
              <input
                type="radio"
                name="chunking-strategy"
                value="question"
                checked={chunkingStrategy === 'question'}
                onChange={() => setChunkingStrategy('question')}
                className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
              />
              <MessageSquare className="w-4 h-4 text-gray-500" />
              <div>
                <span className="text-sm font-medium text-gray-900">Chunk by Question</span>
                <span className="text-xs text-gray-500 ml-1">Detect Q&A pairs, extract tables</span>
              </div>
            </label>
          </div>
        </div>
        {hasEntries && (
          <div className="mt-4 space-y-2">
            <div className="text-sm font-medium text-gray-600">