Merge branch 'RAG-workflow'

2026-05-15 13:35:54 +08:00 · 2026-05-15 13:35:54 +08:00 · f637ab10a5
parent c8a9c857f7 9bef65de7b
commit f637ab10a5
21 changed files with 1753 additions and 25 deletions
--- a/.plans/package8_enhancement_plan.md
+++ b/.plans/package8_enhancement_plan.md
@ -327,7 +327,7 @@ For each section in the JSON response:
 If `qa_vision_enabled=false` or for cost optimization, use regex as a fast pre-filter. The LLM call is skipped for documents where regex confidently identifies all boundaries, and used only when:
 - No regex pattern matches (unknown format)
 - Regex produces < 2 sections (likely misdetection)
- `qa_verification_model` is not set to `"none"`
+- `qa_structure_model` is not set to `"none"`

 ### Algorithm Detail: Table-to-Markdown

@ -382,7 +382,7 @@ class Settings(BaseSettings):
    # NEW: Q&A chunking config
    qa_vision_enabled: bool = True   # Toggle vision-based table extraction (uses existing LLM_MODEL_NAME)
    qa_max_chunk_tokens: int = 3000  # Max tokens per Q&A chunk (before forced split)
-    qa_verification_model: str = ""  # LLM for boundary verification (empty = use LLM_MODEL_NAME)
+    qa_structure_model: str = ""  # LLM for structure detection (empty = use LLM_MODEL_NAME)
    qa_include_internal_refs: bool = True  # Include [內部參考] in chunks
    qa_cache_vision_results: bool = True   # Cache vision results per page

@ -390,7 +390,7 @@ class Settings(BaseSettings):
    # DEFAULT_CHUNKING_STRATEGY=token
    # QA_VISION_ENABLED=true
    # QA_MAX_CHUNK_TOKENS=3000
-    # QA_VERIFICATION_MODEL=
+    # QA_STRUCTURE_MODEL=
    # QA_INCLUDE_INTERNAL_REFS=true
    # QA_CACHE_VISION_RESULTS=true

--- a/backend/.env.example
+++ b/backend/.env.example
@ -41,3 +41,11 @@ MAX_VIDEO_SIZE_MB=300
 # Set to false to disable System Audio or Listen Mic capture
 SYSTEM_AUDIO_ENABLED=true
 MIC_ENABLED=true
+
+# Q&A-pair chunking (Package 8)
+DEFAULT_CHUNKING_STRATEGY=token
+QA_VISION_ENABLED=true
+QA_MAX_CHUNK_TOKENS=3000
+QA_STRUCTURE_MODEL=
+QA_INCLUDE_INTERNAL_REFS=true
+QA_CACHE_VISION_RESULTS=true
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@ -44,6 +44,14 @@ class Settings(BaseSettings):
    relevance_threshold: float = 7.0
    llm_timeout: float = 60.0

+    # Q&A-pair chunking strategy (Package 8)
+    default_chunking_strategy: str = "token"
+    qa_vision_enabled: bool = True
+    qa_max_chunk_tokens: int = 3000
+    qa_structure_model: str = ""
+    qa_include_internal_refs: bool = True
+    qa_cache_vision_results: bool = True
+
    # Alibaba Cloud DashScope ASR (Phase 2)
    dashscope_api_key: str = ""
    asr_model_name: str = "qwen3-asr-flash"
--- a/backend/app/models/documents.py
+++ b/backend/app/models/documents.py
@ -8,6 +8,7 @@ class DocumentInfo(BaseModel):
    filename: str
    chunk_count: int
    upload_date: str
+    chunking_strategy: str = "token"


 class ChunkInfo(BaseModel):
@ -16,6 +17,14 @@ class ChunkInfo(BaseModel):
    content_summary: str
    page_number: Optional[int] = None
    chunk_file_path: Optional[str] = None
+    strategy_type: Optional[str] = None
+    question_index: Optional[int] = None
+    question_id: Optional[str] = None
+    question_text: Optional[str] = None
+    section_heading: Optional[str] = None
+    answer_contains_table: Optional[bool] = None
+    source_page_range: Optional[List[int]] = None
+    parent_topic: Optional[str] = None


 class DocumentListResponse(BaseModel):
--- a/backend/app/models/ingest.py
+++ b/backend/app/models/ingest.py
@ -1,7 +1,18 @@
+from typing import Literal
+
 from pydantic import BaseModel

+ChunkingStrategyType = Literal["token", "question"]
+
+VALID_CHUNKING_STRATEGIES = frozenset({"token", "question"})
+
+
+class IngestRequest(BaseModel):
+    strategy: ChunkingStrategyType = "token"
+

 class IngestResponse(BaseModel):
    document_id: str
    chunk_count: int
    filename: str
+    strategy: ChunkingStrategyType = "token"
--- a/backend/app/routers/ingest.py
+++ b/backend/app/routers/ingest.py
@ -5,9 +5,9 @@ import tempfile
 import uuid
 from pathlib import Path

-from fastapi import APIRouter, UploadFile, File, HTTPException
+from fastapi import APIRouter, UploadFile, File, HTTPException, Query

-from app.models.ingest import IngestResponse
+from app.models.ingest import IngestResponse, VALID_CHUNKING_STRATEGIES

 logger = logging.getLogger(__name__)
 router = APIRouter(tags=["ingest"])
@ -37,11 +37,14 @@ def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None:


@router.post("/ingest", response_model=IngestResponse)
-async def ingest_document(file: UploadFile = File(...)):
+async def ingest_document(
+    file: UploadFile = File(...),
+    strategy: str = Query("token"),
+):
    """Ingest a document into the RAG system."""
    from app.core.config import get_settings
    from app.services.rag import RAGService
-    from app.utils.chunking import TokenChunkingStrategy
+    from app.utils.chunking import get_chunking_strategy
    from app.utils.metadata import extract_metadata

    filename = file.filename or "unknown"
@ -53,6 +56,12 @@ async def ingest_document(file: UploadFile = File(...)):
            detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}",
        )

+    if strategy not in VALID_CHUNKING_STRATEGIES:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Invalid chunking strategy: {strategy}. Valid: {', '.join(sorted(VALID_CHUNKING_STRATEGIES))}",
+        )
+
    settings = get_settings()
    temp_path = None
    try:
@ -68,9 +77,7 @@ async def ingest_document(file: UploadFile = File(...)):
        _delete_existing_document(rag, filename, chunk_dir)

        document_id = str(uuid.uuid4())
-        chunker = TokenChunkingStrategy(
-            chunk_size=settings.chunk_size, overlap=settings.chunk_overlap
-        )
+        chunker = get_chunking_strategy(strategy, settings)

        if file_ext == ".pdf":
            from app.utils.pdf_parser import parse_pdf_by_page
@ -105,6 +112,8 @@ async def ingest_document(file: UploadFile = File(...)):
                    )
                    chunk_file_paths.append(None)

+            chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
+
            metadata = extract_metadata(
                temp_path,
                chunk_texts,
@ -112,6 +121,8 @@ async def ingest_document(file: UploadFile = File(...)):
                page_numbers=page_numbers,
                chunk_file_paths=chunk_file_paths,
                document_id=document_id,
+                strategy_type=strategy,
+                chunk_metadata=chunk_metadata,
            )

            rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id)
@ -145,9 +156,12 @@ async def ingest_document(file: UploadFile = File(...)):
                    )
                    chunk_file_paths.append(None)

+            chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
+
            metadata = extract_metadata(
                temp_path, chunks, original_filename=filename,
                chunk_file_paths=chunk_file_paths, document_id=document_id,
+                strategy_type=strategy, chunk_metadata=chunk_metadata,
            )
            rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)

@ -180,9 +194,12 @@ async def ingest_document(file: UploadFile = File(...)):
                    )
                    chunk_file_paths.append(None)

+            chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
+
            metadata = extract_metadata(
                temp_path, chunks, original_filename=filename,
                chunk_file_paths=chunk_file_paths, document_id=document_id,
+                strategy_type=strategy, chunk_metadata=chunk_metadata,
            )
            rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)

@ -193,6 +210,7 @@ async def ingest_document(file: UploadFile = File(...)):
            document_id=document_id,
            chunk_count=chunk_count,
            filename=filename,
+            strategy=strategy,
        )

    except HTTPException:
--- a/backend/app/test/acceptance/test_acceptance_phase8_qa_chunking.py
+++ b/backend/app/test/acceptance/test_acceptance_phase8_qa_chunking.py
@ -0,0 +1,60 @@
+"""Acceptance tests: Phase 8 Q&A-pair chunking with real LTT PDFs.
+
+Prerequisites:
+- ChromaDB running (local)
+- .env configured with valid LLM_BASE_URL, LLM_API_KEY, LLM_MODEL_NAME
+- Test PDFs available in ../../test materials/LTT/
+
+These tests require real LLM calls and actual LegCo PDFs.
+Run manually: pytest app/test/acceptance/test_acceptance_phase8_qa_chunking.py -v -m acceptance
+"""
+import os
+import sys
+
+import pytest
+
+
+@pytest.mark.acceptance
+@pytest.mark.slow
+class TestRealQaChunking:
+    """End-to-end Q&A chunking with real LegCo PDFs from test materials/LTT/."""
+
+    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
+    def test_real_qa_chunking_fileE(self):
+        """File E produces 12 Chinese Q&A pairs + 3 Others + narrative sections."""
+        pass
+
+    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
+    def test_real_qa_chunking_fileL(self):
+        """File L produces 24 English Q&A pairs + narrative sections."""
+        pass
+
+    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
+    def test_real_qa_chunking_fileB(self):
+        """File B produces 3 Chinese Q&A pairs + narrative sections."""
+        pass
+
+    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
+    def test_real_qa_chunking_fileA(self):
+        """File A falls back to narrative chunking (no Q&A, should not error)."""
+        pass
+
+    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
+    def test_table_extraction_fileE(self):
+        """Tables in File E answers converted to markdown."""
+        pass
+
+    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
+    def test_table_extraction_fileL(self):
+        """Tables in File L answers converted to markdown."""
+        pass
+
+    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
+    def test_qa_page_references(self):
+        """Each Q&A chunk's page number points to question (問) location."""
+        pass
+
+    @pytest.mark.skip(reason="Requires full pipeline with LLM, embeddings, ChromaDB")
+    def test_full_pipeline_question_strategy(self):
+        """Full ingest -> retrieve -> query pipeline with Q&A chunks."""
+        pass
--- a/backend/app/test/test_phase1_config.py
+++ b/backend/app/test/test_phase1_config.py
@ -31,3 +31,47 @@ def test_config_default_values(monkeypatch):
    settings = Settings()
    assert settings.llm_base_url == "https://openrouter.ai/api/v1"
    assert settings.llm_model_name == "qwen/qwen3.5-35b-a3b"
+
+
+def test_qa_chunking_config_defaults(monkeypatch):
+    """Phase 8.0: Q&A chunking config fields have correct defaults."""
+    monkeypatch.delenv("DEFAULT_CHUNKING_STRATEGY", raising=False)
+    monkeypatch.delenv("QA_VISION_ENABLED", raising=False)
+    monkeypatch.delenv("QA_MAX_CHUNK_TOKENS", raising=False)
+    monkeypatch.delenv("QA_STRUCTURE_MODEL", raising=False)
+    monkeypatch.delenv("QA_INCLUDE_INTERNAL_REFS", raising=False)
+    monkeypatch.delenv("QA_CACHE_VISION_RESULTS", raising=False)
+
+    from app.core.config import Settings
+
+    settings = Settings()
+    assert settings.default_chunking_strategy == "token"
+    assert settings.qa_vision_enabled is True
+    assert settings.qa_max_chunk_tokens == 3000
+    assert settings.qa_structure_model == ""
+    assert settings.qa_include_internal_refs is True
+    assert settings.qa_cache_vision_results is True
+
+
+def test_qa_chunking_config_from_env(tmp_path, monkeypatch):
+    """Phase 8.0: Q&A chunking config fields load from .env."""
+    env_file = tmp_path / ".env"
+    env_file.write_text(
+        "DEFAULT_CHUNKING_STRATEGY=question\n"
+        "QA_VISION_ENABLED=false\n"
+        "QA_MAX_CHUNK_TOKENS=5000\n"
+        "QA_STRUCTURE_MODEL=anthropic/claude-3-haiku\n"
+        "QA_INCLUDE_INTERNAL_REFS=false\n"
+        "QA_CACHE_VISION_RESULTS=false\n"
+    )
+
+    monkeypatch.chdir(tmp_path)
+    from app.core.config import Settings
+
+    settings = Settings()
+    assert settings.default_chunking_strategy == "question"
+    assert settings.qa_vision_enabled is False
+    assert settings.qa_max_chunk_tokens == 5000
+    assert settings.qa_structure_model == "anthropic/claude-3-haiku"
+    assert settings.qa_include_internal_refs is False
+    assert settings.qa_cache_vision_results is False
--- a/backend/app/test/test_phase8_ingest.py
+++ b/backend/app/test/test_phase8_ingest.py
@ -0,0 +1,209 @@
+"""Phase 8 tests: Ingest API integration with strategy selection (Sub-Phase 8.3).
+
+Covers:
+- POST /api/v1/ingest?strategy=token — existing behavior unchanged
+- POST /api/v1/ingest?strategy=question — Q&A chunking applied
+- Invalid strategy values return 400
+- IngestResponse includes strategy field
+- TXT with Q&A format uses question strategy
+- Document without Q&A falls back gracefully
+"""
+import io
+import json
+from typing import List, Tuple
+from unittest.mock import MagicMock
+
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+from pypdf import PdfWriter
+
+from app.routers.ingest import router
+
+
+class _DeterministicEmbedding:
+    def name(self) -> str:
+        return "test_deterministic"
+
+    def __call__(self, input):
+        return self._embed(input)
+
+    def embed_query(self, input):
+        return self._embed(input)
+
+    @staticmethod
+    def _embed(texts):
+        vectors = []
+        for text in texts:
+            vec = [0.0] * 384
+            for i, ch in enumerate(text[:384]):
+                vec[i] = ord(ch) / 1000.0
+            vectors.append(vec)
+        return vectors
+
+
+def _create_real_pdf(content: str) -> bytes:
+    writer = PdfWriter()
+    writer.add_blank_page(width=200, height=200)
+    buf = io.BytesIO()
+    writer.write(buf)
+    return buf.getvalue()
+
+
+def _create_text_txt(content: str) -> bytes:
+    return content.encode("utf-8")
+
+
+@pytest.fixture
+def client(tmp_path, monkeypatch):
+    """TestClient with real ChromaDB isolated in tmp_path + deterministic embeddings."""
+    chroma_path = str(tmp_path / "chroma_db")
+    chunk_path = str(tmp_path / "document_chunk")
+    prompts_path = str(tmp_path / "prompts.db")
+    history_path = str(tmp_path / "history.db")
+
+    monkeypatch.setenv("CHROMA_DB_PATH", chroma_path)
+    monkeypatch.setenv("DOCUMENT_CHUNK_PATH", chunk_path)
+    monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path)
+    monkeypatch.setenv("HISTORY_DB_PATH", history_path)
+    monkeypatch.setenv("EMBEDDING_MODEL", "test-mock")
+    monkeypatch.setenv("LLM_API_KEY", "test-key")
+
+    from app.core.config import get_settings
+    get_settings.cache_clear()
+    from app.core.dependencies import get_settings_cached
+    get_settings_cached.cache_clear()
+
+    from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles
+    conn = _get_db(prompts_path)
+    init_prompts_db(conn)
+    seed_default_profiles(conn)
+    conn.close()
+
+    hconn = _get_db(history_path)
+    init_history_db(hconn)
+    hconn.close()
+
+    monkeypatch.setattr(
+        "app.core.database.get_embedding_function_settings",
+        lambda settings: _DeterministicEmbedding(),
+    )
+
+    test_app = FastAPI()
+    test_app.include_router(router, prefix="/api/v1")
+
+    yield TestClient(test_app)
+
+    get_settings_cached.cache_clear()
+    get_settings.cache_clear()
+
+
+def test_ingest_with_strategy_token(client):
+    """Existing behavior unchanged: strategy=token uses TokenChunkingStrategy."""
+    txt_bytes = _create_text_txt("This is a test document with enough content to generate chunks.")
+    resp = client.post(
+        "/api/v1/ingest?strategy=token",
+        files={"file": ("test.txt", txt_bytes, "text/plain")},
+    )
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["strategy"] == "token"
+    assert data["chunk_count"] > 0
+
+
+def test_ingest_invalid_strategy_rejected(client):
+    """Invalid strategy values return 400."""
+    txt_bytes = _create_text_txt("test")
+    resp = client.post(
+        "/api/v1/ingest?strategy=invalid",
+        files={"file": ("test.txt", txt_bytes, "text/plain")},
+    )
+    assert resp.status_code == 400
+    assert "strategy" in resp.json()["detail"].lower()
+
+
+def test_ingest_response_includes_strategy(client):
+    """IngestResponse includes the strategy field."""
+    txt_bytes = _create_text_txt("Strategy response test content with more text to ensure chunks.")
+    resp = client.post(
+        "/api/v1/ingest?strategy=token",
+        files={"file": ("test.txt", txt_bytes, "text/plain")},
+    )
+    assert resp.status_code == 200
+    assert "strategy" in resp.json()
+
+
+def test_ingest_default_strategy_is_token(client):
+    """When no strategy param provided, default to token."""
+    txt_bytes = _create_text_txt("Default strategy test with enough text to generate output.")
+    resp = client.post(
+        "/api/v1/ingest",
+        files={"file": ("test.txt", txt_bytes, "text/plain")},
+    )
+    assert resp.status_code == 200
+    assert resp.json()["strategy"] == "token"
+
+
+def test_ingest_question_strategy_txt(client, monkeypatch):
+    """TXT with Q&A format uses question strategy and produces chunks."""
+    _mock_question_chunker(monkeypatch)
+
+    txt_bytes = _create_text_txt("問A1：test question\n答A1：test answer with more text here to ensure chunking works properly.")
+
+    resp = client.post(
+        "/api/v1/ingest?strategy=question",
+        files={"file": ("test.txt", txt_bytes, "text/plain")},
+    )
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["strategy"] == "question"
+    assert data["chunk_count"] > 0
+
+
+def test_ingest_question_strategy_no_qa_fallback(client, monkeypatch):
+    """Document without Q&A markers ingests without error under strategy=question (chunker is mocked, so the real narrative fallback is not exercised here)."""
+    _mock_question_chunker(monkeypatch)
+
+    txt_bytes = _create_text_txt("This is plain text without any Q&A markers, but it needs to be long enough to generate at least one chunk when processed by the tokenizer.")
+
+    resp = client.post(
+        "/api/v1/ingest?strategy=question",
+        files={"file": ("plain.txt", txt_bytes, "text/plain")},
+    )
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["strategy"] == "question"
+    assert data["chunk_count"] > 0
+
+
+def _mock_question_chunker(monkeypatch):
+    """Replace QuestionChunkingStrategy with a mock that returns test chunks."""
+
+    class _MockQuestionChunker:
+        def __init__(self, settings=None, llm_client=None):
+            self._chunk_metadata = [
+                {
+                    "strategy_type": "question",
+                    "section_type": "qa",
+                    "question_index": 0,
+                    "question_id": "A1",
+                    "question_text": "What is X?",
+                    "section_heading": "(A) Topic",
+                    "answer_contains_table": False,
+                    "source_page_range": [1, 2],
+                }
+            ]
+            self._max_tokens = 3000
+
+        def chunk(self, text):
+            self._chunk_metadata = self._chunk_metadata[:1]
+            return ["Question: What is X?\n\nAnswer: X is Y."]
+
+        def chunk_pages(self, pages, overlap_tokens=0):
+            self._chunk_metadata = self._chunk_metadata[:1]
+            return [("Question: What is X?\n\nAnswer: X is Y.", 1)]
+
+    monkeypatch.setattr(
+        "app.utils.chunking.QuestionChunkingStrategy",
+        _MockQuestionChunker,
+    )
--- a/backend/app/test/test_phase8_metadata.py
+++ b/backend/app/test/test_phase8_metadata.py
@ -0,0 +1,149 @@
+"""Phase 8 tests: Q&A metadata enrichment (Sub-Phase 8.2).
+
+Covers:
+- Metadata enrichment with Q&A-specific fields via chunk_metadata param
+- Backward compatibility: token strategy unchanged
+- Page number references question location
+- Chunk metadata merging with base metadata
+"""
+import json
+
+import pytest
+
+from app.utils.metadata import extract_metadata
+
+
+def test_qa_metadata_fields(tmp_path):
+    """strategy_type, question_index, question_id, question_text merged via chunk_metadata."""
+    file_path = tmp_path / "test.pdf"
+    file_path.write_text("dummy content")
+
+    chunks = ["chunk 1", "chunk 2"]
+    chunk_metadata = [
+        {
+            "strategy_type": "question",
+            "section_type": "qa",
+            "question_index": 0,
+            "question_id": "A1",
+            "question_text": "What is X?",
+            "section_heading": "(A) Section",
+            "answer_contains_table": True,
+            "source_page_range": [2, 5],
+            "parent_topic": "Topic Name",
+        },
+        {
+            "strategy_type": "question",
+            "section_type": "qa",
+            "question_index": 1,
+            "question_id": "A2",
+            "question_text": "What is Y?",
+            "section_heading": "(A) Section",
+            "answer_contains_table": False,
+            "source_page_range": [5, 7],
+        },
+    ]
+
+    metadata = extract_metadata(
+        file_path=str(file_path),
+        chunks=chunks,
+        strategy_type="question",
+        chunk_metadata=chunk_metadata,
+    )
+    assert len(metadata) == 2
+
+    m0 = metadata[0]
+    assert m0["strategy_type"] == "question"
+    assert m0["section_type"] == "qa"
+    assert m0["question_index"] == 0
+    assert m0["question_id"] == "A1"
+    assert m0["question_text"] == "What is X?"
+    assert m0["section_heading"] == "(A) Section"
+    assert m0["answer_contains_table"] is True
+    assert m0["source_page_range"] == [2, 5]
+    assert m0["parent_topic"] == "Topic Name"
+
+    m1 = metadata[1]
+    assert m1["question_index"] == 1
+    assert m1["question_id"] == "A2"
+    assert m1["answer_contains_table"] is False
+
+
+def test_qa_metadata_topic_section(tmp_path):
+    """section_heading and parent_topic are both preserved."""
+    file_path = tmp_path / "test.pdf"
+    file_path.write_text("dummy content")
+
+    metadata = extract_metadata(
+        file_path=str(file_path),
+        chunks=["chunk"],
+        strategy_type="question",
+        chunk_metadata=[{"section_heading": "(B) Traffic", "parent_topic": "Traffic Planning"}],
+    )
+    assert metadata[0]["section_heading"] == "(B) Traffic"
+    assert metadata[0]["parent_topic"] == "Traffic Planning"
+
+
+def test_token_metadata_unchanged(tmp_path):
+    """Existing metadata fields unchanged for token strategy (no chunk_metadata)."""
+    file_path = tmp_path / "test.txt"
+    file_path.write_text("test content")
+
+    metadata = extract_metadata(
+        file_path=str(file_path),
+        chunks=["chunk 1", "chunk 2"],
+        original_filename="original.txt",
+        strategy_type="token",
+    )
+    assert len(metadata) == 2
+    for m in metadata:
+        assert "filename" in m
+        assert "upload_date" in m
+        assert "content_summary" in m
+        assert "chunk_index" in m
+        assert m.get("strategy_type", "token") == "token"
+        assert "question_id" not in m
+
+
+def test_page_number_from_question(tmp_path):
+    """Page ref should point to question location (pass via page_numbers from strategy)."""
+    file_path = tmp_path / "test.pdf"
+    file_path.write_text("dummy content")
+
+    metadata = extract_metadata(
+        file_path=str(file_path),
+        chunks=["question chunk"],
+        page_numbers=[3],
+        strategy_type="question",
+        chunk_metadata=[{
+            "question_id": "A1",
+            "source_page_range": [3, 8],
+        }],
+    )
+    assert metadata[0]["page_number"] == 3
+    assert metadata[0]["source_page_range"] == [3, 8]
+
+
+def test_chunk_metadata_length_mismatch(tmp_path):
+    """chunk_metadata length mismatch with chunks raises ValueError."""
+    file_path = tmp_path / "test.pdf"
+    file_path.write_text("dummy content")
+
+    with pytest.raises(ValueError, match="chunk_metadata length"):
+        extract_metadata(
+            file_path=str(file_path),
+            chunks=["a", "b", "c"],
+            chunk_metadata=[{}, {}],
+        )
+
+
+def test_chunk_metadata_empty_no_error(tmp_path):
+    """An empty chunk_metadata list is accepted without a length-mismatch error, even when chunks is non-empty."""
+    file_path = tmp_path / "test.pdf"
+    file_path.write_text("dummy content")
+
+    metadata = extract_metadata(
+        file_path=str(file_path),
+        chunks=["a"],
+        chunk_metadata=[],
+    )
+    assert len(metadata) == 1
--- a/backend/app/test/test_phase8_qa_chunking.py
+++ b/backend/app/test/test_phase8_qa_chunking.py
@ -0,0 +1,481 @@
+"""Phase 8 tests: Q&A-pair chunking strategy (Sub-Phase 8.1).
+
+Covers:
+- LLM structure detection response parsing (parse_llm_structure_response)
+- Mixed format handling (問/答 + section headings)
+- Narrative-only text (no Q&A format)
+- Speaking notes (發言要點) chunking by bullet
+- Regex fast-pass for Chinese 問/答 format
+- Regex fast-pass for English Q1/Q2 format
+- Multi-page section tracking with [PAGE_BREAK] markers
+- ChunkingStrategy ABC compliance
+- Page number references question (問) page, not answer
+- Size limit: oversized sections recursively split with heading preserved
+- build_chunks_from_sections output verification
+- preprocess_text: footer stripping, colon normalization, page break insertion
+"""
+
+import json
+from typing import List, Tuple
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from app.utils.qa_chunking import (
+    Section,
+    preprocess_text,
+    build_structure_detection_prompt,
+    parse_llm_structure_response,
+    split_chinese_qa,
+    split_english_qa,
+    build_chunks_from_sections,
+)
+from app.utils.chunking import (
+    ChunkingStrategy,
+    QuestionChunkingStrategy,
+    get_chunking_strategy,
+)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def mock_settings():
+    """Minimal Settings mock with Q&A chunking defaults."""
+    s = MagicMock()
+    s.default_chunking_strategy = "question"
+    s.qa_vision_enabled = False
+    s.qa_max_chunk_tokens = 3000
+    s.qa_structure_model = ""
+    s.qa_include_internal_refs = True
+    s.qa_cache_vision_results = True
+    s.chunk_size = 1000
+    s.chunk_overlap = 200
+    s.llm_model_name = "test-model"
+    s.llm_api_key = "test-key"
+    s.llm_base_url = "https://example.com/v1"
+    s.llm_timeout = 30.0
+    s.llm_enable_thinking = False
+    s.vllm_engine = False
+    return s
+
+
+SAMPLE_LLM_RESPONSE = json.dumps({
+    "sections": [
+        {
+            "type": "qa",
+            "heading": "(A) 排水系統",
+            "qa_id": "A1",
+            "question": "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化？",
+            "answer": "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。",
+            "start_page": 2,
+            "end_page": 3,
+            "has_table": False,
+            "parent_topic": "排水系統",
+        },
+        {
+            "type": "narrative",
+            "heading": "(1) 住戶的安置補償",
+            "content": "合資格住戶可選擇(i)「須通過經濟狀況審查」安置安排。",
+            "start_page": 2,
+            "end_page": 5,
+            "has_table": False,
+        },
+        {
+            "type": "speaking_notes",
+            "heading": "發言要點",
+            "content": "⚫ 古洞北／粉嶺北新發展區是北部都會區內的新市鎮發展項目\n⚫ 第一期發展預計於2030年完成",
+            "start_page": 1,
+            "end_page": 2,
+            "has_table": False,
+        },
+    ]
+})
+
+
+# ---------------------------------------------------------------------------
+# Test: LLM structure detection parsing
+# ---------------------------------------------------------------------------
+
+class TestLLMStructureDetection:
+
+    def test_llm_structure_detection(self):
+        """parse_llm_structure_response correctly parses LLM JSON with qa/narrative/speaking_notes."""
+        sections = parse_llm_structure_response(SAMPLE_LLM_RESPONSE)
+        assert len(sections) == 3
+
+        qa = sections[0]
+        assert qa.type == "qa"
+        assert qa.qa_id == "A1"
+        assert qa.question == "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化？"
+        assert qa.answer == "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。"
+        assert qa.start_page == 2
+        assert qa.end_page == 3
+        assert qa.heading == "(A) 排水系統"
+        assert qa.parent_topic == "排水系統"
+
+        narr = sections[1]
+        assert narr.type == "narrative"
+        assert narr.heading == "(1) 住戶的安置補償"
+        assert "合資格住戶" in narr.content
+
+        notes = sections[2]
+        assert notes.type == "speaking_notes"
+        assert "⚫" in notes.content
+
+    def test_llm_handles_mixed_formats(self):
+        """Document with 問/答 markers + section headings correctly classified."""
+        mixed_json = json.dumps({
+            "sections": [
+                {
+                    "type": "qa",
+                    "heading": "(B) 交通",
+                    "qa_id": "B1",
+                    "question": "新建道路何時通車？",
+                    "answer": "預計2027年通車。",
+                    "start_page": 3,
+                    "end_page": 4,
+                    "has_table": False,
+                },
+                {
+                    "type": "narrative",
+                    "heading": "背景",
+                    "content": "本文件說明交通規劃。",
+                    "start_page": 1,
+                    "end_page": 2,
+                    "has_table": False,
+                },
+            ]
+        })
+        sections = parse_llm_structure_response(mixed_json)
+        assert len(sections) == 2
+        assert sections[0].type == "qa"
+        assert sections[1].type == "narrative"
+
+    def test_llm_handles_no_qa_format(self):
+        """Narrative-only text (like File L pages 1-13) produces only narrative sections."""
+        narrative_json = json.dumps({
+            "sections": [
+                {
+                    "type": "narrative",
+                    "heading": "Introduction",
+                    "content": "This document provides background on policy matters.",
+                    "start_page": 1,
+                    "end_page": 5,
+                    "has_table": False,
+                },
+                {
+                    "type": "narrative",
+                    "heading": "Analysis",
+                    "content": "The analysis covers multiple dimensions.",
+                    "start_page": 5,
+                    "end_page": 13,
+                    "has_table": False,
+                },
+            ]
+        })
+        sections = parse_llm_structure_response(narrative_json)
+        assert len(sections) == 2
+        assert all(s.type == "narrative" for s in sections)
+
+    def test_llm_handles_speaking_notes(self):
+        """A 發言要點 response parses to one speaking_notes section that keeps all three bullets."""
+        bullet_text = "⚫ 要點一：政策方向\n⚫ 要點二：實施計劃\n⚫ 要點三：預算安排"
+        payload = {
+            "sections": [
+                {
+                    "type": "speaking_notes",
+                    "heading": "發言要點",
+                    "content": bullet_text,
+                    "start_page": 1,
+                    "end_page": 2,
+                    "has_table": False,
+                },
+            ]
+        }
+        parsed = parse_llm_structure_response(json.dumps(payload))
+        assert len(parsed) == 1
+        only_section = parsed[0]
+        assert only_section.type == "speaking_notes"
+        assert only_section.content.count("⚫") == 3
+
+    def test_parse_markdown_fenced_json(self):
+        """A response wrapped in a ```json ... ``` fence parses like the bare JSON."""
+        wrapped = f"```json\n{SAMPLE_LLM_RESPONSE}\n```"
+        parsed = parse_llm_structure_response(wrapped)
+        assert len(parsed) == 3
+
+    def test_parse_invalid_json_raises(self):
+        """Non-JSON input makes parse_llm_structure_response raise ValueError."""
+        not_json = "this is not json"
+        with pytest.raises(ValueError, match="Invalid JSON"):
+            parse_llm_structure_response(not_json)
+
+
+# ---------------------------------------------------------------------------
+# Test: Regex fast-pass
+# ---------------------------------------------------------------------------
+
+class TestRegexFastPass:
+    """Regex-only detection of Q&A markers (no LLM involved)."""
+
+    def test_regex_fastpass_chinese(self):
+        """問B1/答B1 style markers are picked up by split_chinese_qa."""
+        lines = [
+            "(A) 排水系統\n",
+            "問 B1：古洞北的設計是否能抵禦氣候變化？\n",
+            "答 B1：研究顧問已為古洞北新發展區進行了評估。\n",
+            "問 B2：第二個問題是什麼？\n",
+            "答 B2：這是第二個問題的答案。\n",
+        ]
+        found = split_chinese_qa("".join(lines))
+        assert len(found) >= 2
+        # Every detected section is a Q&A pair.
+        assert all(item.type == "qa" for item in found)
+        # The first pair carries the 古洞北 question.
+        assert "古洞北" in found[0].question
+
+    def test_regex_fastpass_chinese_no_match(self):
+        """Text without 問/答 markers yields an empty list."""
+        plain = "This is plain text without any Q&A markers."
+        assert split_chinese_qa(plain) == []
+
+    def test_regex_fastpass_english(self):
+        """Q1/Q2 style markers are picked up by split_english_qa."""
+        lines = [
+            "Background information here.\n\n",
+            "Q1 What is the timeline for the project?\n",
+            "The project is expected to complete by 2027.\n",
+            "Q2 How much will it cost?\n",
+            "The estimated cost is HK$500 million.\n",
+        ]
+        found = split_english_qa("".join(lines))
+        assert len(found) >= 2
+        assert all(item.type == "qa" for item in found)
+        questions = [(item.question or "").lower() for item in found]
+        assert any("timeline" in question for question in questions)
+
+    def test_regex_fastpass_english_no_match(self):
+        """Text without Q-number markers yields an empty list."""
+        chinese_only = "純中文文本沒有英文問答標記。"
+        assert split_english_qa(chinese_only) == []
+
+
+# ---------------------------------------------------------------------------
+# Test: Multi-page tracking
+# ---------------------------------------------------------------------------
+
+class TestMultiPage:
+
+    def test_multi_page_sections(self):
+        """preprocess_text emits one [PAGE_BREAK: N] marker for each input page."""
+        page_bodies = {
+            1: "Header line\n(A) Water drainage\nSome intro text",
+            2: "More drainage info\nFooter text X-1",
+            3: "New section begins\n(B) Traffic planning",
+        }
+        merged = preprocess_text(list(page_bodies.items()))
+        # Each page number must show up as a page-break marker.
+        for page_number in (1, 2, 3):
+            assert f"[PAGE_BREAK: {page_number}]" in merged
+
+
+# ---------------------------------------------------------------------------
+# Test: ABC contract
+# ---------------------------------------------------------------------------
+
+class TestABCContract:
+
+    def test_abc_contract(self):
+        """A QuestionChunkingStrategy instance is a ChunkingStrategy."""
+        settings_stub = MagicMock()
+        settings_stub.qa_max_chunk_tokens = 3000
+        settings_stub.qa_include_internal_refs = True
+        assert isinstance(
+            QuestionChunkingStrategy(settings=settings_stub), ChunkingStrategy
+        )
+
+    def test_get_chunking_strategy_factory(self, mock_settings):
+        """The factory returns the expected strategy class for each name."""
+        expected_classes = (
+            ("token", ChunkingStrategy),
+            ("question", QuestionChunkingStrategy),
+        )
+        for strategy_name, expected_cls in expected_classes:
+            built = get_chunking_strategy(strategy_name, mock_settings)
+            assert isinstance(built, expected_cls)
+
+
+# ---------------------------------------------------------------------------
+# Test: Page number reference
+# ---------------------------------------------------------------------------
+
+class TestPageNumberReference:
+
+    def test_page_number_reference_question(self):
+        """The chunk's page number is the question (問) page, not the answer's last page."""
+        qa_section = Section(
+            type="qa",
+            heading="(A) Topic",
+            qa_id="A1",
+            question="What is X?",
+            answer="X is Y.",
+            start_page=5,
+            end_page=8,
+        )
+        built = build_chunks_from_sections([qa_section])
+        assert len(built) == 1
+        _text, reported_page, meta = built[0]
+        # start_page is where the question sits.
+        assert reported_page == 5
+        assert meta.get("source_page_range") == [5, 8]
+
+
+# ---------------------------------------------------------------------------
+# Test: Size limit recursive split
+# ---------------------------------------------------------------------------
+
+class TestSizeLimit:
+
+    def test_size_limit(self):
+        """An oversized QA section is split and every piece carries the question text."""
+        question = "What is the detailed plan?"
+        # 80 paragraphs of ~200 characters each: far beyond the 500-token budget below.
+        paragraphs = [f"Paragraph {i}: " + "x" * 200 for i in range(80)]
+        oversized = Section(
+            type="qa",
+            heading="(A) Topic",
+            qa_id="A1",
+            question=question,
+            answer="\n\n".join(paragraphs),
+            start_page=2,
+            end_page=5,
+            has_table=False,
+        )
+        pieces = build_chunks_from_sections([oversized], max_tokens=500)
+        assert len(pieces) > 1
+        for piece_text, piece_page, _meta in pieces:
+            # The question is repeated in every sub-chunk ...
+            assert question in piece_text
+            # ... and the page reference stays on the question page.
+            assert piece_page == 2
+
+
+# ---------------------------------------------------------------------------
+# Test: build_chunks_from_sections
+# ---------------------------------------------------------------------------
+
+class TestBuildChunksFromSections:
+
+    def test_build_chunks_from_sections(self):
+        """One section of each kind produces the expected chunk texts and metadata."""
+        qa_section = Section(
+            type="qa",
+            heading="(A) 排水系統",
+            qa_id="A1",
+            question="古洞北的設計是否能抵禦氣候變化？",
+            answer="研究顧問已為古洞北進行了評估。",
+            start_page=2,
+            end_page=3,
+            has_table=True,
+            parent_topic="排水系統",
+        )
+        narrative_section = Section(
+            type="narrative",
+            heading="(1) 住戶的安置補償",
+            content="合資格住戶可選擇安置安排。",
+            start_page=3,
+            end_page=5,
+            has_table=False,
+        )
+        notes_section = Section(
+            type="speaking_notes",
+            heading="發言要點",
+            content="⚫ 要點一：政策方向\n⚫ 要點二：實施計劃",
+            start_page=1,
+            end_page=1,
+            has_table=False,
+        )
+        toc_section = Section(
+            type="toc",
+            heading="目錄",
+            content="Page 1 ... Page 2",
+            start_page=1,
+            end_page=1,
+            has_table=False,
+        )
+        chunks = build_chunks_from_sections(
+            [qa_section, narrative_section, notes_section, toc_section]
+        )
+        # Expected: 1 QA + 1 narrative + 2 speaking_notes bullets + 0 toc = 4
+        assert len(chunks) >= 4
+
+        def chunks_of(kind):
+            """Return the (text, page, meta) triples whose section_type is *kind*."""
+            return [entry for entry in chunks if entry[2].get("section_type") == kind]
+
+        # The QA chunk is emitted first.
+        qa_text, qa_page, qa_meta = chunks[0]
+        assert "古洞北" in qa_text
+        assert qa_page == 2
+        assert qa_meta["section_type"] == "qa"
+        assert qa_meta["question_id"] == "A1"
+        assert qa_meta["question_index"] == 0
+        assert qa_meta["answer_contains_table"] is True
+        assert qa_meta["section_heading"] == "(A) 排水系統"
+
+        # Exactly one narrative chunk, carrying heading and body.
+        narrative_chunks = chunks_of("narrative")
+        assert len(narrative_chunks) == 1
+        narrative_text = narrative_chunks[0][0]
+        assert "住戶的安置補償" in narrative_text
+        assert "合資格住戶" in narrative_text
+
+        # One chunk per speaking-notes bullet.
+        notes_chunks = chunks_of("speaking_notes")
+        assert len(notes_chunks) == 2
+        for notes_text, _page, _meta in notes_chunks:
+            assert "要點" in notes_text
+
+        # The table of contents is never chunked.
+        assert len(chunks_of("toc")) == 0
+
+
+# ---------------------------------------------------------------------------
+# Test: preprocess_text
+# ---------------------------------------------------------------------------
+
+class TestPreprocessText:
+
+    def test_preprocess_text(self):
+        """Footers are removed, fullwidth colons become ASCII, page markers are added."""
+        first_page = (1, "Header\n(A) Section Title\nX-1\n2024-01-15")
+        second_page = (2, "Content with：fullwidth colon\nMore text：here")
+        cleaned = preprocess_text([first_page, second_page])
+
+        # One marker per page.
+        assert "[PAGE_BREAK: 1]" in cleaned
+        assert "[PAGE_BREAK: 2]" in cleaned
+
+        # Fullwidth colons are replaced by the ASCII colon.
+        assert "：" not in cleaned
+        assert ":" in cleaned
+
+        # The page footer (X-1 followed by a date line) is gone.
+        assert "X-1" not in cleaned
+        assert "2024-01-15" not in cleaned
+
+
+# ---------------------------------------------------------------------------
+# Test: build_structure_detection_prompt
+# ---------------------------------------------------------------------------
+
+class TestBuildPrompt:
+
+    def test_build_structure_detection_prompt(self):
+        """The prompt names the document source, the section types, and embeds the text."""
+        document = "Sample document text [PAGE_BREAK: 1]"
+        prompt = build_structure_detection_prompt(document)
+        lowered = prompt.lower()
+        assert any(
+            phrase in prompt
+            for phrase in ("Hong Kong Legislative Council", "Legislative Council")
+        )
+        assert "qa" in lowered or "問" in prompt
+        assert "narrative" in lowered
+        assert any(label in lowered for label in ("speaking_notes", "speaking notes"))
+        assert document in prompt
--- a/backend/app/utils/chunking.py
+++ b/backend/app/utils/chunking.py
@ -6,8 +6,15 @@ token-based windows.
 """
 from __future__ import annotations

+import logging
 from abc import ABC, abstractmethod
-from typing import List, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+if TYPE_CHECKING:
+    from app.core.config import Settings
+    from app.services.llm_client import LLMClient
+
+logger = logging.getLogger(__name__)


 class ChunkingStrategy(ABC):
@ -117,3 +124,111 @@ class TokenChunkingStrategy(ChunkingStrategy):
            results.append(("\n".join(parts), page_num))

        return results
+
+
+class QuestionChunkingStrategy(ChunkingStrategy):
+    """Chunk text by detecting Q&A structure using LLM and/or regex patterns.
+
+    Designed for LegCo documents with explicit 問/答 or Q1/Q2 markers.
+    When no structure is detected the whole input becomes one narrative
+    section.
+
+    After ``chunk`` or ``chunk_pages`` returns, ``self._chunk_metadata`` holds
+    one metadata dict per returned chunk, in the same order.
+    """
+
+    def __init__(
+        self,
+        settings: "Settings",
+        llm_client: Optional["LLMClient"] = None,
+    ):
+        """Store the settings and the optional LLM client.
+
+        Args:
+            settings: Application settings; ``qa_max_chunk_tokens`` is read
+                from it (3000 when the attribute is missing).
+            llm_client: Optional client used by ``chunk_pages`` when the
+                regex fast-pass finds nothing.
+        """
+        self._settings = settings
+        self._llm_client = llm_client
+        self._max_tokens = getattr(settings, "qa_max_chunk_tokens", 3000)
+        self._chunk_metadata: List[dict] = []
+
+    def chunk(self, text: str) -> List[str]:
+        """Split text into chunks using regex Q&A detection (for DOCX/TXT).
+
+        Returns an empty list for empty or whitespace-only input.
+        """
+        if not text or not text.strip():
+            # Reset so a reused instance never exposes metadata of an earlier call.
+            self._chunk_metadata = []
+            return []
+
+        from app.utils.qa_chunking import Section, build_chunks_from_sections
+
+        sections = self._detect_sections_regex(text)
+        if not sections:
+            sections = [Section(type="narrative", content=text, start_page=1, end_page=1)]
+
+        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
+        self._chunk_metadata = [meta for _, _, meta in results]
+        return [chunk_text for chunk_text, _, _ in results]
+
+    def chunk_pages(
+        self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
+    ) -> List[Tuple[str, int]]:
+        """Split page-segmented text using Q&A detection (for PDF).
+
+        Detection order: Chinese regex, English regex, then the LLM (only
+        when a client was supplied and no event loop is running in the
+        calling thread), and finally a single narrative section.
+
+        Args:
+            pages: ``(page_number, page_text)`` tuples in document order.
+            overlap_tokens: Accepted for interface compatibility; not used.
+
+        Returns:
+            List of ``(chunk_text, page_number)`` where page_number
+            references the question location for Q&A chunks.
+        """
+        if not pages:
+            self._chunk_metadata = []
+            return []
+
+        from app.utils.qa_chunking import (
+            Section,
+            build_chunks_from_sections,
+            preprocess_text,
+        )
+
+        full_text = preprocess_text(pages)
+
+        sections = self._detect_sections_regex(full_text)
+        if sections:
+            # The regex helpers know nothing about pages; derive them from
+            # the [PAGE_BREAK: N] markers that preprocess_text inserted.
+            self._assign_pages_from_markers(sections, full_text)
+        elif self._llm_client is not None:
+            sections = self._detect_sections_llm(full_text)
+
+        if not sections:
+            sections = [
+                Section(
+                    type="narrative",
+                    content=full_text,
+                    # Use the real page numbers rather than assuming 1..len(pages).
+                    start_page=pages[0][0],
+                    end_page=pages[-1][0],
+                )
+            ]
+
+        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
+        self._chunk_metadata = [meta for _, _, meta in results]
+        return [(chunk_text, page_num) for chunk_text, page_num, _ in results]
+
+    @staticmethod
+    def _detect_sections_regex(text: str) -> list:
+        """Run the Chinese regex fast-pass, then the English one if it found nothing."""
+        from app.utils.qa_chunking import split_chinese_qa, split_english_qa
+
+        return split_chinese_qa(text) or split_english_qa(text)
+
+    def _detect_sections_llm(self, full_text: str) -> list:
+        """Ask the LLM to classify sections; return ``[]`` when that is not possible."""
+        import asyncio
+
+        from app.utils.qa_chunking import (
+            build_structure_detection_prompt,
+            parse_llm_structure_response,
+        )
+
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            pass  # No loop is running in this thread, so the coroutine can be driven here.
+        else:
+            # A synchronous method cannot block on a coroutine from inside a running loop.
+            logger.warning(
+                "LLM structure detection skipped: chunk_pages was called from a running event loop"
+            )
+            return []
+
+        prompt = build_structure_detection_prompt(full_text)
+        try:
+            # asyncio.run creates and closes its own loop, so this also works in
+            # worker threads, where asyncio.get_event_loop() raises RuntimeError.
+            response = asyncio.run(
+                self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection")
+            )
+            return parse_llm_structure_response(response)
+        except Exception:
+            logger.warning("LLM structure detection failed, using fallback", exc_info=True)
+            return []
+
+    @staticmethod
+    def _assign_pages_from_markers(sections: list, full_text: str) -> None:
+        """Set start/end pages of regex-detected Q&A sections and strip page markers.
+
+        Each section's question is located in ``full_text`` (searching forward
+        from the previous hit) and mapped to the last ``[PAGE_BREAK: N]`` marker
+        at or before it. The end page advances for every marker inside the
+        answer that is followed by more answer text. Sections are updated in
+        place; nothing happens when ``full_text`` has no markers.
+        """
+        import re
+        from bisect import bisect_right
+
+        marker_re = re.compile(r"\[PAGE_BREAK: (\d+)\]\n?")
+        markers = list(marker_re.finditer(full_text))
+        if not markers:
+            return
+        offsets = [marker.start() for marker in markers]
+        page_numbers = [int(marker.group(1)) for marker in markers]
+
+        cursor = 0
+        for section in sections:
+            question = section.question or ""
+            answer = section.answer or ""
+
+            position = full_text.find(question, cursor) if question else -1
+            if position < 0:
+                position = cursor
+            else:
+                cursor = position + len(question)
+
+            start_page = page_numbers[max(bisect_right(offsets, position) - 1, 0)]
+            end_page = start_page
+            for marker in marker_re.finditer(answer):
+                # A marker at the very end of the answer only announces the
+                # next question's page; the answer did not reach that page.
+                if marker_re.sub("", answer[marker.end():]).strip():
+                    end_page = int(marker.group(1))
+
+            section.start_page = start_page
+            section.end_page = end_page
+            section.question = marker_re.sub("", question).strip()
+            section.answer = marker_re.sub("", answer).strip()
+
+
+def get_chunking_strategy(name: str, settings: "Settings") -> ChunkingStrategy:
+    """Build the chunking strategy selected by *name*.
+
+    Args:
+        name: "question" selects the Q&A strategy; any other value
+            (including "token") selects the token strategy.
+        settings: Application settings instance. ``chunk_size`` and
+            ``chunk_overlap`` are read for the token strategy.
+
+    Returns:
+        ChunkingStrategy instance.
+    """
+    if name != "question":
+        return TokenChunkingStrategy(
+            chunk_size=settings.chunk_size,
+            overlap=settings.chunk_overlap,
+        )
+    return QuestionChunkingStrategy(settings=settings)
--- a/backend/app/utils/metadata.py
+++ b/backend/app/utils/metadata.py
@ -12,6 +12,8 @@ def extract_metadata(
    page_numbers: List[int | None] | None = None,
    chunk_file_paths: List[str | None] | None = None,
    document_id: str | None = None,
+    strategy_type: str = "token",
+    chunk_metadata: List[Dict[str, Any]] | None = None,
 ) -> List[Dict[str, Any]]:
    """Extract metadata for a list of text chunks.

@ -23,6 +25,10 @@ def extract_metadata(
    - chunk_file_path: path to the per-chunk source file
    - document_id: unique identifier linking all chunks to the same document

+    Package 8 Q&A fields (present when chunk_metadata provided):
+    - strategy_type, section_type, question_index, question_id, question_text,
+      section_heading, answer_contains_table, source_page_range, parent_topic
+
    Args:
        file_path: Path to the file associated with the chunks.
        chunks: List of string chunks to generate metadata for.
@ -31,6 +37,12 @@ def extract_metadata(
        page_numbers: Optional per-chunk page numbers. Length must match chunks.
        chunk_file_paths: Optional per-chunk source file paths. Length must match chunks.
        document_id: Optional unique document identifier applied to all chunks.
+        strategy_type: Chunking strategy used ("token" or "question"). Stored in
+            each chunk's metadata.
+        chunk_metadata: Optional per-chunk metadata dicts from Q&A strategy.
+            Each dict is merged into the corresponding base metadata entry.
+            Length must match chunks. Fields like question_id, question_index,
+            section_type, etc. are forwarded to ChromaDB metadata.

    Returns:
        A list of metadata dictionaries, one per chunk. Empty list if chunks is empty.
@ -55,6 +67,11 @@ def extract_metadata(
            f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
        )

+    if chunk_metadata is not None and len(chunk_metadata) > 0 and len(chunk_metadata) != len(chunks):
+        raise ValueError(
+            f"chunk_metadata length ({len(chunk_metadata)}) does not match chunks length ({len(chunks)})"
+        )
+
    filename = original_filename if original_filename else os.path.basename(file_path)
    upload_date = datetime.now().isoformat()

@ -68,6 +85,7 @@ def extract_metadata(
            "content_summary": content_summary,
            "chunk_index": idx,
            "document_id": document_id,
+            "strategy_type": strategy_type,
        }
        page_num = page_numbers[idx] if page_numbers else None
        if page_num is not None:
@ -75,6 +93,8 @@ def extract_metadata(
        cfp = chunk_file_paths[idx] if chunk_file_paths else None
        if cfp is not None:
            entry["chunk_file_path"] = cfp
+        if chunk_metadata:
+            entry.update(chunk_metadata[idx])
        metadata.append(entry)

    return metadata
--- a/backend/app/utils/qa_chunking.py
+++ b/backend/app/utils/qa_chunking.py
@ -0,0 +1,361 @@
+"""Q&A-pair chunking utilities for Package 8.
+
+Provides section detection (LLM + regex), text preprocessing,
+and chunk building for LegCo documents with Q&A structure.
+"""
+from __future__ import annotations
+
+import json
+import logging
+import re
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Section:
+    """A detected section within a LegCo document.
+
+    Produced by the regex splitters and by parse_llm_structure_response, and
+    consumed by build_chunks_from_sections.
+    """
+    type: str  # "qa" | "narrative" | "speaking_notes" | "table" | "toc" | "heading_only"
+    # Section heading, e.g. "(A) 排水系統"; prefixed to chunk text as "[heading]".
+    heading: str = ""
+    # Question identifier such as "A1" or "Q3" (qa sections only).
+    qa_id: Optional[str] = None
+    # Question and answer text (qa sections only).
+    question: Optional[str] = None
+    answer: Optional[str] = None
+    # Body text for narrative, speaking_notes and table sections.
+    content: str = ""
+    # Page range of the section; start_page is reported as the chunk's page.
+    start_page: int = 1
+    end_page: int = 1
+    # Copied into QA chunk metadata as "answer_contains_table".
+    has_table: bool = False
+    # Copied into QA chunk metadata as "parent_topic".
+    parent_topic: str = ""
+
+
+# A line holding only a page footer such as "X-1".
+_FOOTER_RE = re.compile(r"^[A-Z]-\d+\s*$", re.MULTILINE)
+# The same footer followed by an ISO date line, e.g. "X-1" then "2024-01-15".
+_FOOTER_DATE_RE = re.compile(r"^[A-Z]-\d+\s*\n\d{4}-\d{2}-\d{2}$", re.MULTILINE)
+# A line holding only a letter header such as "(A)".
+_HEADER_LETTER_RE = re.compile(r"^(\([A-Z]\))\s*$", re.MULTILINE)
+# Fullwidth colon variants that are normalized to ":".
+_FULLWIDTH_COLON_RE = re.compile("[︰：]")
+
+
+def preprocess_text(pages: List[Tuple[int, str]]) -> str:
+    """Join pages into one string with a ``[PAGE_BREAK: N]`` line before each page.
+
+    Per page: footer lines (with or without a trailing date line) are removed,
+    bare letter-header lines are removed on every page except the first, and
+    fullwidth colons are replaced by ``:``.
+    """
+    rendered: List[str] = []
+    for position, (page_num, page_text) in enumerate(pages):
+        # Remove the footer+date form first, otherwise the plain footer
+        # pattern would leave the date line behind.
+        cleaned = _FOOTER_RE.sub("", _FOOTER_DATE_RE.sub("", page_text))
+        if position:
+            cleaned = _HEADER_LETTER_RE.sub("", cleaned)
+        cleaned = _FULLWIDTH_COLON_RE.sub(":", cleaned)
+        rendered.append(f"[PAGE_BREAK: {page_num}]\n{cleaned}")
+    return "\n".join(rendered)
+
+
+# Prompt sent to the LLM for section classification. It is a str.format
+# template: {document_text} is the only placeholder, and the literal braces
+# of the JSON example are escaped as {{ and }}.
+_STRUCTURE_PROMPT_TEMPLATE = """You are analyzing a Hong Kong Legislative Council document.
+The text has page markers like [PAGE_BREAK: N] showing where pages begin.
+
+For each distinct section in this document, identify:
+1. The section type:
+   - "qa": a question-and-answer pair (問/答 or Q1/Q2 format)
+   - "narrative": policy text, explanatory paragraphs, section content with bullets
+   - "speaking_notes": briefing points (發言要點) with bullet markers
+   - "table": standalone data tables (not embedded in answers)
+   - "toc": table of contents
+   - "heading_only": a section heading with no following content
+
+2. For "qa" sections:
+   - The question text (exact)
+   - The answer text (exact, including tables, bullet lists, and [內部參考] content)
+   - The question ID if present (e.g. "A1", "Q3")
+   - The start page and end page
+
+3. For all sections:
+   - The section heading (e.g. "(A) 排水系統", "(1) 住戶的安置補償")
+   - The start page and end page
+   - Whether the section contains tables
+
+Return JSON:
+{{
+  "sections": [
+    {{
+      "type": "qa",
+      "heading": "(A) 排水系統",
+      "qa_id": "A1",
+      "question": "...",
+      "answer": "...",
+      "start_page": 2,
+      "end_page": 3,
+      "has_table": true,
+      "parent_topic": "排水系統"
+    }},
+    {{
+      "type": "narrative",
+      "heading": "(1) 住戶的安置補償",
+      "content": "...",
+      "start_page": 2,
+      "end_page": 5,
+      "has_table": false
+    }}
+  ]
+}}
+
+DOCUMENT TEXT:
+{document_text}"""
+
+
+def build_structure_detection_prompt(text: str) -> str:
+    """Return the section-classification prompt with *text* embedded as the document."""
+    placeholders = {"document_text": text}
+    return _STRUCTURE_PROMPT_TEMPLATE.format_map(placeholders)
+
+
+# Captures the body of a ``` or ```json fenced block.
+_MARKDOWN_FENCE_RE = re.compile(r"```(?:json)?\s*\n?(.*?)\n?```", re.DOTALL)
+
+
+def _coerce_page(value: object, default: int) -> int:
+    """Return *value* as a page number >= 1, or *default* when it is not usable."""
+    try:
+        page = int(value)  # type: ignore[call-overload]
+    except (TypeError, ValueError):
+        return default
+    return page if page >= 1 else default
+
+
+def parse_llm_structure_response(response_text: str) -> List[Section]:
+    """Parse the JSON returned by the LLM into Section objects.
+
+    A markdown code fence around the JSON is stripped first. JSON ``null``
+    values fall back to the same defaults as missing keys, page numbers are
+    coerced to integers, and ``end_page`` is never smaller than
+    ``start_page``. Entries of ``sections`` that are not JSON objects are
+    skipped with a warning.
+
+    Raises:
+        ValueError: If the response is not valid JSON, or is valid JSON that
+            is not an object holding a ``sections`` list.
+    """
+    cleaned = response_text.strip()
+    fence_match = _MARKDOWN_FENCE_RE.search(cleaned)
+    if fence_match:
+        cleaned = fence_match.group(1).strip()
+
+    try:
+        data = json.loads(cleaned)
+    except json.JSONDecodeError as exc:
+        raise ValueError(f"Invalid JSON from LLM structure detection: {exc}") from exc
+
+    # json.loads accepts any JSON value; only an object has a "sections" key.
+    if not isinstance(data, dict):
+        raise ValueError(
+            "Invalid JSON from LLM structure detection: expected an object, "
+            f"got {type(data).__name__}"
+        )
+    sections_raw = data.get("sections") or []
+    if not isinstance(sections_raw, list):
+        raise ValueError(
+            "Invalid JSON from LLM structure detection: 'sections' must be a list, "
+            f"got {type(sections_raw).__name__}"
+        )
+
+    sections: List[Section] = []
+    for raw in sections_raw:
+        if not isinstance(raw, dict):
+            logger.warning("Skipping non-object section entry from LLM: %r", raw)
+            continue
+        start_page = _coerce_page(raw.get("start_page"), 1)
+        end_page = max(start_page, _coerce_page(raw.get("end_page"), start_page))
+        sections.append(Section(
+            # `or` maps JSON null to the default, as for a missing key.
+            type=raw.get("type") or "narrative",
+            heading=raw.get("heading") or "",
+            qa_id=raw.get("qa_id"),
+            question=raw.get("question"),
+            answer=raw.get("answer"),
+            content=raw.get("content") or "",
+            start_page=start_page,
+            end_page=end_page,
+            has_table=bool(raw.get("has_table", False)),
+            parent_topic=raw.get("parent_topic") or "",
+        ))
+    return sections
+
+
+# 問 <id>: question, then 答 <same id>: answer. The answer runs lazily up to
+# the next 問 marker or to the end of the input (\Z). Ending on \Z without a
+# preceding newline keeps the last pair when the text has no trailing newline.
+_CN_QA_RE = re.compile(
+    r"問\s*([A-Z]\d+)\s*[︰：:]\s*(.*?)\s*"
+    r"(?:\n\s*答\s*\1\s*[︰：:]\s*(.*?)\s*)"
+    r"(?=\n\s*問\s*[A-Z]\d+\s*[︰：:]|\Z)",
+    re.DOTALL,
+)
+
+
+def split_chinese_qa(text: str) -> List[Section]:
+    """Regex fast-pass for 問/答 format.
+
+    Each 問 <id> / 答 <id> pair becomes one "qa" Section holding the id, the
+    question and the answer. Page fields keep their defaults.
+
+    Returns:
+        Sections in document order; an empty list if no pair is found.
+    """
+    sections: List[Section] = []
+    for m in _CN_QA_RE.finditer(text):
+        qa_id = m.group(1)
+        question = m.group(2).strip()
+        answer = (m.group(3) or "").strip()
+        sections.append(Section(
+            type="qa",
+            qa_id=qa_id,
+            question=question,
+            answer=answer,
+        ))
+    return sections
+
+
+# A line "Q<n> question", then every following line (blank ones included) up
+# to the next line that starts with Q<n>. Each loop iteration consumes either
+# a full line with its newline or a final unterminated line, so blank lines no
+# longer cut the answer short and no iteration can match the empty string.
+_EN_QA_RE = re.compile(
+    r"^(Q\d+)\s+(.*?)\s*$\n?((?:(?!^Q\d+)(?:.*\n|.+\Z))*)",
+    re.MULTILINE,
+)
+
+
+def split_english_qa(text: str) -> List[Section]:
+    """Regex fast-pass for Q-number format.
+
+    Each line starting with ``Q<n>`` opens a "qa" Section: the rest of that
+    line is the question and the lines up to the next ``Q<n>`` line (or the
+    end of the text) are the answer. Page fields keep their defaults.
+
+    Returns:
+        Sections in document order; an empty list if no marker is found.
+    """
+    sections: List[Section] = []
+    for m in _EN_QA_RE.finditer(text):
+        qa_id = m.group(1)
+        question = m.group(2).strip()
+        answer = m.group(3).strip()
+        sections.append(Section(
+            type="qa",
+            qa_id=qa_id,
+            question=question,
+            answer=answer,
+        ))
+    return sections
+
+
+def _estimate_tokens(text: str) -> int:
+    """Estimate the token count of *text*.
+
+    Characters in U+4E00..U+9FFF count 1.3 tokens each; every other
+    character counts a quarter of a token. The sum is truncated to an int.
+    """
+    cjk_count = sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
+    other_count = len(text) - cjk_count
+    return int(cjk_count * 1.3 + other_count / 4)
+
+
+def _split_oversized_qa(
+    question: str, answer: str, page: int, heading: str,
+    qa_id: Optional[str], question_index: int, has_table: bool,
+    parent_topic: str, start_page: int, end_page: int,
+    max_tokens: int,
+) -> List[Tuple[str, int, dict]]:
+    """Split an oversized Q&A answer into parts, each prefixed with the question.
+
+    The answer is cut at blank lines, or at single newlines when it has no
+    blank line, and consecutive pieces are regrouped greedily while the
+    estimated size stays within ``max_tokens``. Pieces are re-joined with the
+    separator they were split on, so the answer text is not altered. A single
+    piece that is itself over the limit is emitted as is; there is no further
+    splitting.
+
+    Args:
+        question: Question text repeated at the top of every sub-chunk.
+        answer: Answer text to split.
+        page: Page number reported for every sub-chunk.
+        heading: Section heading, stored in the metadata only.
+        qa_id: Question identifier, stored in the metadata.
+        question_index: Index of the Q&A pair, stored in the metadata.
+        has_table: Stored as ``answer_contains_table``.
+        parent_topic: Stored in the metadata.
+        start_page: First page of ``source_page_range``.
+        end_page: Last page of ``source_page_range``.
+        max_tokens: Estimated-token budget per sub-chunk.
+
+    Returns:
+        List of ``(chunk_text, page, metadata)`` tuples, one per sub-chunk.
+    """
+    # Try paragraph boundaries first, then fall back to line boundaries.
+    separator = "\n\n"
+    parts = answer.split(separator)
+    if len(parts) <= 1:
+        separator = "\n"
+        parts = answer.split(separator)
+
+    # Group parts into sub-chunks that fit within max_tokens.
+    sub_chunks: List[str] = []
+    current = ""
+    for part in parts:
+        candidate = f"{current}{separator}{part}" if current else part
+        # "part 1/N" stands in for the real part label, which is not known yet.
+        probe = f"Question: {question}\n\nAnswer (part 1/N): {candidate}"
+        if current and _estimate_tokens(probe) > max_tokens:
+            sub_chunks.append(current)
+            current = part
+        else:
+            current = candidate
+    if current:
+        sub_chunks.append(current)
+
+    total = len(sub_chunks)
+    results: List[Tuple[str, int, dict]] = []
+    for number, sub in enumerate(sub_chunks, start=1):
+        chunk_text = f"Question: {question}\n\nAnswer (part {number}/{total}): {sub}"
+        # A fresh dict per chunk, so callers may mutate one without touching the others.
+        meta = {
+            "strategy_type": "question",
+            "section_type": "qa",
+            "question_index": question_index,
+            "question_id": qa_id,
+            "question_text": question,
+            "section_heading": heading,
+            "answer_contains_table": has_table,
+            "source_page_range": [start_page, end_page],
+            "parent_topic": parent_topic,
+        }
+        results.append((chunk_text, page, meta))
+    return results
+
+
+def build_chunks_from_sections(
+    sections: List[Section], max_tokens: int = 3000,
+) -> List[Tuple[str, int, dict]]:
+    """Build chunk texts + page refs + metadata from sections.
+
+    Handling per section type:
+    - "toc" / "heading_only": skipped.
+    - "qa": one "Question: ... Answer: ..." chunk, or several via
+      ``_split_oversized_qa`` when the estimate exceeds ``max_tokens``.
+    - "speaking_notes": one chunk per "⚫" bullet.
+    - "table": one chunk holding the whole content.
+    - "narrative": one chunk, or paragraph groups when over ``max_tokens``.
+    - any other type: logged and chunked like "narrative", so its content is
+      not lost.
+
+    A non-empty heading is prefixed to the chunk text as ``[heading]`` on its
+    own line, except for the sub-chunks of a split QA pair. Sections with
+    blank content are skipped (QA sections are always emitted).
+
+    Args:
+        sections: Sections in document order.
+        max_tokens: Estimated-token budget per chunk.
+
+    Returns:
+        List of ``(chunk_text, page_number, metadata_dict)``; page_number is
+        always the section's ``start_page``.
+    """
+    chunks: List[Tuple[str, int, dict]] = []
+    qa_index = 0
+
+    for section in sections:
+        kind = section.type
+        if kind in ("toc", "heading_only"):
+            continue
+
+        # Sections built from LLM output may carry None here.
+        heading = section.heading or ""
+        prefix = f"[{heading}]\n" if heading else ""
+        page = section.start_page
+        page_range = [section.start_page, section.end_page]
+
+        if kind == "qa":
+            question_text = section.question or ""
+            answer_text = section.answer or ""
+            chunk_text = f"{prefix}Question: {question_text}\n\nAnswer: {answer_text}"
+
+            if _estimate_tokens(chunk_text) > max_tokens:
+                chunks.extend(_split_oversized_qa(
+                    question=question_text,
+                    answer=answer_text,
+                    page=page,
+                    heading=heading,
+                    qa_id=section.qa_id,
+                    question_index=qa_index,
+                    has_table=section.has_table,
+                    parent_topic=section.parent_topic,
+                    start_page=section.start_page,
+                    end_page=section.end_page,
+                    max_tokens=max_tokens,
+                ))
+            else:
+                meta: Dict = {
+                    "strategy_type": "question",
+                    "section_type": "qa",
+                    "question_index": qa_index,
+                    "question_id": section.qa_id,
+                    "question_text": question_text,
+                    "section_heading": heading,
+                    "answer_contains_table": section.has_table,
+                    "source_page_range": page_range,
+                    "parent_topic": section.parent_topic,
+                }
+                chunks.append((chunk_text, page, meta))
+
+            qa_index += 1
+            continue
+
+        content = section.content or ""
+        if not content.strip():
+            continue
+
+        if kind == "speaking_notes":
+            # The zero-width lookahead keeps each "⚫" with its own bullet text.
+            bullets = [b.strip() for b in re.split(r"(?=⚫)", content) if b.strip()]
+            for bullet in bullets or [content]:
+                meta = {
+                    "strategy_type": "question",
+                    "section_type": "speaking_notes",
+                    "section_heading": heading,
+                    "source_page_range": list(page_range),
+                }
+                chunks.append((f"{prefix}{bullet}", page, meta))
+            continue
+
+        if kind == "table":
+            meta = {
+                "strategy_type": "question",
+                "section_type": "table",
+                "section_heading": heading,
+                "answer_contains_table": True,
+                "source_page_range": page_range,
+            }
+            chunks.append((f"{prefix}{content}", page, meta))
+            continue
+
+        if kind != "narrative":
+            logger.warning("Unknown section type %r; chunking it as narrative", kind)
+
+        meta = {
+            "strategy_type": "question",
+            "section_type": "narrative",
+            "section_heading": heading,
+            "source_page_range": page_range,
+        }
+        if _estimate_tokens(f"{prefix}{content}") <= max_tokens:
+            chunks.append((f"{prefix}{content}", page, meta))
+            continue
+
+        # Oversized narrative: regroup paragraphs greedily under the budget.
+        current = ""
+        for para in content.split("\n\n"):
+            candidate = f"{current}\n\n{para}" if current else para
+            if current and _estimate_tokens(f"{prefix}{candidate}") > max_tokens:
+                chunks.append((f"{prefix}{current}", page, dict(meta)))
+                current = para
+            else:
+                current = candidate
+        if current:
+            chunks.append((f"{prefix}{current}", page, dict(meta)))
+
+    return chunks
--- a/backend/app/utils/table_extraction.py
+++ b/backend/app/utils/table_extraction.py
@ -0,0 +1,147 @@
+"""Table extraction utilities for Package 8.
+
+Provides vision-based and text-based table detection and markdown conversion
+for LegCo documents. Uses the existing LLM model (vision-capable) for
+table-to-markdown conversion.
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+import logging
+import os
+from pathlib import Path
+from typing import List, Optional
+
+logger = logging.getLogger(__name__)
+
+# On-disk cache for vision results: ".cache/vision_tables" under the directory
+# three levels above this module's resolved path.
+_CACHE_DIR = Path(__file__).resolve().parent.parent.parent / ".cache" / "vision_tables"
+
+
+async def extract_tables_vision(page_images: List[str], llm_client) -> List[str]:
+    """Ask the vision-capable LLM to transcribe each page image as Markdown.
+
+    Args:
+        page_images: Base64-encoded PNG strings, one per page.
+        llm_client: Client object; this function reads ``llm_client.model``
+            and awaits ``llm_client._client.chat.completions.create``.
+
+    Returns:
+        The stripped Markdown replies in page order. Pages whose request
+        raised, or whose reply was blank, are omitted, so the list can be
+        shorter than ``page_images``.
+    """
+    instruction = (
+        "Convert this page to Markdown. For any tables:\n"
+        "- Use proper markdown table syntax with |---|---| alignment\n"
+        "- Preserve all column headers and row labels\n"
+        "- Do not modify or translate the content\n"
+        "- If a table spans multiple pages, note it"
+    )
+    markdown_pages: List[str] = []
+    for page_no, encoded_png in enumerate(page_images):
+        text_part = {"type": "text", "text": instruction}
+        image_part = {
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{encoded_png}"},
+        }
+        request = [{"role": "user", "content": [text_part, image_part]}]
+        try:
+            completion = await llm_client._client.chat.completions.create(
+                model=llm_client.model,
+                messages=request,
+                temperature=0.1,
+            )
+            page_markdown = (completion.choices[0].message.content or "").strip()
+            if page_markdown:
+                markdown_pages.append(page_markdown)
+        except Exception:
+            # Best effort: one failing page must not abort the remaining pages.
+            logger.warning("Vision table extraction failed for page image %d", page_no, exc_info=True)
+    return markdown_pages
+
+
+# Per-line heuristics for spotting table-like text. Each pattern is searched
+# against a single line that has already been split on "\n".
+_TABLE_HEURISTIC_RE = [
+    r"(?:\|[\s\-:]+\|)",  # markdown separator row such as |---|
+    r"(?:\+[-=]+\+)",  # ASCII-art border such as +-----+
+    # Three or more cells, each followed by a run of 2+ whitespace characters.
+    # The pattern must not require a trailing "\n": the lines it is tested
+    # against never contain one, so such a pattern could never match.
+    r"(?:(?:\S+\s{2,}){3,})",
+]
+
+_TABLE_REGION_PROMPT = (
+    "Convert this raw table text extracted from a PDF into a markdown table.\n"
+    "Preserve all data exactly. Detect column boundaries and alignment.\n\n"
+    "{table_text}"
+)
+
+
+def _find_table_regions(text: str) -> List[str]:
+    """Return runs of at least three lines that look like raw table text."""
+    import re
+
+    patterns = [re.compile(pat) for pat in _TABLE_HEURISTIC_RE]
+    regions: List[str] = []
+    current_region: List[str] = []
+    in_table = False
+
+    for line in text.split("\n"):
+        if any(pattern.search(line) for pattern in patterns):
+            in_table = True
+            current_region.append(line)
+        elif in_table and line.strip():
+            # A non-blank line directly inside a table run is kept with it
+            # even when it does not match a heuristic itself.
+            current_region.append(line)
+        else:
+            # A blank line (or any line outside a run) closes the region.
+            if len(current_region) >= 3:
+                regions.append("\n".join(current_region))
+            current_region = []
+            in_table = False
+
+    # Flush a region that runs to the end of the text.
+    if len(current_region) >= 3:
+        regions.append("\n".join(current_region))
+    return regions
+
+
+async def extract_tables_text(text: str, llm_client) -> List[str]:
+    """Detect table-like text regions and have the LLM convert each to markdown.
+
+    Args:
+        text: Raw text to scan line by line.
+        llm_client: Client object; this function awaits
+            ``llm_client.complete(prompt, temperature=..., step_name=...)``
+            and expects a string back.
+
+    Returns:
+        The stripped LLM replies, one per detected region, in document order.
+        Regions whose call raised or whose reply was blank are omitted. An
+        empty list is returned when no region is detected.
+    """
+    results: List[str] = []
+    for region in _find_table_regions(text):
+        prompt = _TABLE_REGION_PROMPT.format(table_text=region)
+        try:
+            response = await llm_client.complete(prompt, temperature=0.1, step_name="TableExtraction")
+            if response.strip():
+                results.append(response.strip())
+        except Exception:
+            # Best effort: a failed region is logged and the rest still run.
+            logger.warning("Text-based table extraction failed", exc_info=True)
+    return results
+
+
+def inject_tables_into_answer(answer: str, tables_md: List[str]) -> str:
+    """Splice markdown tables into *answer* where their header line appears.
+
+    For each table, the first non-blank line (stripped) is the anchor: every
+    occurrence of that text in the answer is replaced by the full table
+    markdown. Tables whose anchor is absent from the answer, and tables that
+    contain only whitespace, leave the answer unchanged.
+
+    Args:
+        answer: Text that may contain raw table header lines.
+        tables_md: Markdown tables to inject.
+
+    Returns:
+        The answer with matching anchors replaced.
+    """
+    if not tables_md:
+        return answer
+    result = answer
+    for table_md in tables_md:
+        header = next(
+            (line.strip() for line in table_md.split("\n") if line.strip()),
+            "",
+        )
+        # An empty anchor must be skipped: "" is a substring of every string,
+        # and str.replace("", x) would insert x between every character.
+        if header and header in result:
+            result = result.replace(header, table_md)
+    return result
+
+
+def cache_vision_result(page_hash: str) -> Optional[str]:
+    """Look up cached markdown for *page_hash* in the on-disk vision cache.
+
+    Despite the name, this only reads: it returns the UTF-8 contents of
+    ``<_CACHE_DIR>/<page_hash>.md`` when that path exists, otherwise ``None``.
+    """
+    entry = _CACHE_DIR / f"{page_hash}.md"
+    return entry.read_text(encoding="utf-8") if entry.exists() else None
+
+
+def save_vision_result(page_hash: str, markdown: str) -> None:
+    """Store *markdown* in the on-disk vision cache under *page_hash*.
+
+    The cache directory is created on demand; an existing entry for the same
+    hash is overwritten.
+    """
+    destination = _CACHE_DIR / f"{page_hash}.md"
+    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    destination.write_text(markdown, encoding="utf-8")
+
+
+def compute_page_hash(page_image_b64: str) -> str:
+    """Derive a cache key for a page image.
+
+    Returns the first 16 hex characters of the SHA-256 digest of the UTF-8
+    encoded base64 string.
+    """
+    hasher = hashlib.sha256()
+    hasher.update(page_image_b64.encode("utf-8"))
+    return hasher.hexdigest()[:16]
--- a/frontend/src/components/ChunkList.tsx
+++ b/frontend/src/components/ChunkList.tsx
@ -56,9 +56,32 @@ export const ChunkList: React.FC<ChunkListProps> = ({
              <span className="text-xs font-medium text-gray-500 uppercase">
                Chunk {chunk.chunk_index}
              </span>
-              <span className="text-xs text-gray-400">
-                Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
-              </span>
+              {chunk.strategy_type === 'question' && chunk.question_id ? (
+                <>
+                  <span className="text-xs text-gray-600">
+                    Q: {chunk.question_id}{chunk.question_text ? ` — ${chunk.question_text.length > 80 ? `${chunk.question_text.slice(0, 80)}...` : chunk.question_text}` : ''}
+                  </span>
+                  {chunk.topic_section && (
+                    <span className="text-xs text-gray-500">
+                      Topic: {chunk.topic_section}
+                    </span>
+                  )}
+                  {chunk.source_page_range && chunk.source_page_range.length === 2 && (
+                    <span className="text-xs text-gray-400">
+                      Pages {chunk.source_page_range[0]}-{chunk.source_page_range[1]}
+                    </span>
+                  )}
+                  {chunk.has_table && (
+                    <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-amber-100 text-amber-700">
+                      Contains table
+                    </span>
+                  )}
+                </>
+              ) : (
+                <span className="text-xs text-gray-400">
+                  Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
+                </span>
+              )}
            </div>
            <div className="text-sm text-gray-700 truncate" title={chunk.content_summary}>
              {chunk.content_summary.length > 100
@ -67,7 +90,12 @@ export const ChunkList: React.FC<ChunkListProps> = ({
            </div>
            {chunk.chunk_file_path && (
              <a
-                href={getPdfViewerUrl(chunk.chunk_file_path, chunk.page_number ?? undefined)}
+                href={getPdfViewerUrl(
+                  chunk.chunk_file_path,
+                  chunk.strategy_type === 'question' && chunk.source_page_range && chunk.source_page_range.length > 0
+                    ? chunk.source_page_range[0]
+                    : chunk.page_number ?? undefined
+                )}
                target="_blank"
                rel="noopener noreferrer"
                className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline"
--- a/frontend/src/components/DocumentList.tsx
+++ b/frontend/src/components/DocumentList.tsx
@ -29,7 +29,18 @@ export const DocumentList: React.FC<DocumentListProps> = ({
            <div className="flex items-center space-x-3 flex-1">
              <FileText className="w-5 h-5 text-gray-500 flex-shrink-0" />
              <div className="flex-1 min-w-0">
-                <div className="font-medium text-gray-900 truncate">{doc.filename}</div>
+                <div className="flex items-center space-x-2">
+                  <span className="font-medium text-gray-900 truncate">{doc.filename}</span>
+                  {doc.chunking_strategy === 'question' ? (
+                    <span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-blue-100 text-blue-700">
+                      chunked by question
+                    </span>
+                  ) : (
+                    <span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-gray-100 text-gray-700">
+                      chunked by token
+                    </span>
+                  )}
+                </div>
                <div className="text-sm text-gray-500">
                  {doc.chunk_count} chunks • Uploaded {doc.upload_date}
                </div>
--- a/frontend/src/lib/api.ts
+++ b/frontend/src/lib/api.ts
@ -1,5 +1,5 @@
 import axios from 'axios'
-import type { QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'
+import type { ChunkingStrategy, QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'

 const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 'http://localhost:8000/api/v1'

@ -48,10 +48,10 @@ export const queryDocumentStream = async (
  }
 }

-export const ingestDocument = async (file: File): Promise<IngestResponse> => {
+export const ingestDocument = async (file: File, strategy: ChunkingStrategy = 'token'): Promise<IngestResponse> => {
  const form = new FormData()
  form.append('file', file)
-  const resp = await apiClient.post<IngestResponse>('/ingest', form, {
+  const resp = await apiClient.post<IngestResponse>(`/ingest?strategy=${strategy}`, form, {
    headers: { 'Content-Type': 'multipart/form-data' },
  })
  return resp.data
--- a/frontend/src/lib/queries.tsx
+++ b/frontend/src/lib/queries.tsx
@ -1,7 +1,7 @@
 import React from 'react'
 import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
 import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api'
-import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
+import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, ChunkingStrategy, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
 import { useState, useCallback, useRef } from 'react'

 export const queryClient = new QueryClient()
@ -185,8 +185,8 @@ export const useQueryDocumentStream = () => {
 }

 export const useIngestDocument = () => {
-  return useMutation<IngestResponse, Error, File>({
-    mutationFn: ingestDocument,
+  return useMutation<IngestResponse, Error, { file: File; strategy: ChunkingStrategy }>({
+    mutationFn: ({ file, strategy }) => ingestDocument(file, strategy),
  })
 }

--- a/frontend/src/pages/RAGDatabasePage.tsx
+++ b/frontend/src/pages/RAGDatabasePage.tsx
@ -1,10 +1,11 @@
 import React, { useState, useCallback, useMemo } from 'react'
-import { Database, AlertCircle, CheckCircle, XCircle, Loader2 } from 'lucide-react'
+import { Database, AlertCircle, CheckCircle, XCircle, Loader2, Type, MessageSquare } from 'lucide-react'
 import { useQueryClient } from '@tanstack/react-query'
 import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries'
 import { DocumentList } from '../components/DocumentList'
 import { ChunkList } from '../components/ChunkList'
 import { DocumentUpload } from '../components/DocumentUpload'
+import type { ChunkingStrategy } from '../types'

 interface FileUploadEntry {
  name: string
@ -22,6 +23,7 @@ export const RAGDatabasePage: React.FC = () => {
  const initialDocId = useMemo(() => getDocumentIdFromUrl(), [])
  const [expandedId, setExpandedId] = useState<string | null>(initialDocId)
  const [uploadEntries, setUploadEntries] = useState<FileUploadEntry[]>([])
+  const [chunkingStrategy, setChunkingStrategy] = useState<ChunkingStrategy>('token')

  const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments()
  const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId)
@ -60,7 +62,7 @@ export const RAGDatabasePage: React.FC = () => {
    const results = await Promise.allSettled(
      files.map(async (file) => {
        try {
-          await ingestDocumentMutation.mutateAsync(file)
+          await ingestDocumentMutation.mutateAsync({ file, strategy: chunkingStrategy })
          setUploadEntries((prev) =>
            prev.map((e) =>
              e.name === file.name ? { ...e, status: 'success' as const } : e
@ -80,7 +82,7 @@ export const RAGDatabasePage: React.FC = () => {

    queryClient.invalidateQueries({ queryKey: ['documents'] })
    setTimeout(() => setUploadEntries([]), 5000)
-  }, [ingestDocumentMutation, queryClient])
+  }, [ingestDocumentMutation, queryClient, chunkingStrategy])

  const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length
  const successCount = uploadEntries.filter((e) => e.status === 'success').length
@ -127,6 +129,42 @@ export const RAGDatabasePage: React.FC = () => {
          />
        </div>

+        <div className="mt-3 flex items-center space-x-4">
+          <span className="text-sm font-medium text-gray-700">Chunking strategy:</span>
+          <div className="flex items-center space-x-3">
+            <label className="flex items-center space-x-2 cursor-pointer">
+              <input
+                type="radio"
+                name="chunking-strategy"
+                value="token"
+                checked={chunkingStrategy === 'token'}
+                onChange={() => setChunkingStrategy('token')}
+                className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
+              />
+              <Type className="w-4 h-4 text-gray-500" />
+              <div>
+                <span className="text-sm font-medium text-gray-900">Chunk by Token</span>
+                <span className="text-xs text-gray-500 ml-1">Split by token windows with overlap</span>
+              </div>
+            </label>
+            <label className="flex items-center space-x-2 cursor-pointer">
+              <input
+                type="radio"
+                name="chunking-strategy"
+                value="question"
+                checked={chunkingStrategy === 'question'}
+                onChange={() => setChunkingStrategy('question')}
+                className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
+              />
+              <MessageSquare className="w-4 h-4 text-gray-500" />
+              <div>
+                <span className="text-sm font-medium text-gray-900">Chunk by Question</span>
+                <span className="text-xs text-gray-500 ml-1">Detect Q&A pairs, extract tables</span>
+              </div>
+            </label>
+          </div>
+        </div>
+
        {hasEntries && (
          <div className="mt-4 space-y-2">
            <div className="text-sm font-medium text-gray-600">
--- a/frontend/src/types/index.ts
+++ b/frontend/src/types/index.ts
@ -1,3 +1,5 @@
+export type ChunkingStrategy = 'token' | 'question'
+
 export interface SourceMetadata {
  filename: string
  upload_date: string
@ -40,6 +42,7 @@ export interface IngestResponse {
  document_id: string
  chunk_count: number
  filename: string
+  strategy: ChunkingStrategy
 }

 export interface DocumentInfo {
@ -47,6 +50,7 @@ export interface DocumentInfo {
  filename: string
  chunk_count: number
  upload_date: string
+  chunking_strategy: ChunkingStrategy
 }

 export interface ChunkInfo {
@ -55,6 +59,13 @@ export interface ChunkInfo {
  content_summary: string
  page_number: number | null
  chunk_file_path: string | null
+  strategy_type: ChunkingStrategy
+  question_index: number | null
+  question_id: string | null
+  question_text: string | null
+  topic_section: string | null
+  source_page_range: number[] | null
+  has_table: boolean | null
 }

 export interface DocumentListResponse {