Merge branch 'RAG-workflow'

This commit is contained in:
Woody 2026-05-15 13:35:54 +08:00
commit f637ab10a5
21 changed files with 1753 additions and 25 deletions

View File

@ -327,7 +327,7 @@ For each section in the JSON response:
If `qa_vision_enabled=false` or for cost optimization, use regex as a fast pre-filter. The LLM call is skipped for documents where regex confidently identifies all boundaries, and used only when: If `qa_vision_enabled=false` or for cost optimization, use regex as a fast pre-filter. The LLM call is skipped for documents where regex confidently identifies all boundaries, and used only when:
- No regex pattern matches (unknown format) - No regex pattern matches (unknown format)
- Regex produces < 2 sections (likely misdetection) - Regex produces < 2 sections (likely misdetection)
- `qa_verification_model` is not set to `"none"` - `qa_structure_model` is not set to `"none"`
### Algorithm Detail: Table-to-Markdown ### Algorithm Detail: Table-to-Markdown
@ -382,7 +382,7 @@ class Settings(BaseSettings):
# NEW: Q&A chunking config # NEW: Q&A chunking config
qa_vision_enabled: bool = True # Toggle vision-based table extraction (uses existing LLM_MODEL_NAME) qa_vision_enabled: bool = True # Toggle vision-based table extraction (uses existing LLM_MODEL_NAME)
qa_max_chunk_tokens: int = 3000 # Max tokens per Q&A chunk (before forced split) qa_max_chunk_tokens: int = 3000 # Max tokens per Q&A chunk (before forced split)
qa_verification_model: str = "" # LLM for boundary verification (empty = use LLM_MODEL_NAME) qa_structure_model: str = "" # LLM for structure detection (empty = use LLM_MODEL_NAME)
qa_include_internal_refs: bool = True # Include [內部參考] in chunks qa_include_internal_refs: bool = True # Include [內部參考] in chunks
qa_cache_vision_results: bool = True # Cache vision results per page qa_cache_vision_results: bool = True # Cache vision results per page
@ -390,7 +390,7 @@ class Settings(BaseSettings):
# DEFAULT_CHUNKING_STRATEGY=token # DEFAULT_CHUNKING_STRATEGY=token
# QA_VISION_ENABLED=true # QA_VISION_ENABLED=true
# QA_MAX_CHUNK_TOKENS=3000 # QA_MAX_CHUNK_TOKENS=3000
# QA_VERIFICATION_MODEL= # QA_STRUCTURE_MODEL=
# QA_INCLUDE_INTERNAL_REFS=true # QA_INCLUDE_INTERNAL_REFS=true
# QA_CACHE_VISION_RESULTS=true # QA_CACHE_VISION_RESULTS=true

View File

@ -41,3 +41,11 @@ MAX_VIDEO_SIZE_MB=300
# Set to false to disable System Audio or Listen Mic capture # Set to false to disable System Audio or Listen Mic capture
SYSTEM_AUDIO_ENABLED=true SYSTEM_AUDIO_ENABLED=true
MIC_ENABLED=true MIC_ENABLED=true
# Q&A-pair chunking (Package 8)
DEFAULT_CHUNKING_STRATEGY=token
QA_VISION_ENABLED=true
QA_MAX_CHUNK_TOKENS=3000
QA_STRUCTURE_MODEL=
QA_INCLUDE_INTERNAL_REFS=true
QA_CACHE_VISION_RESULTS=true

View File

@ -44,6 +44,14 @@ class Settings(BaseSettings):
relevance_threshold: float = 7.0 relevance_threshold: float = 7.0
llm_timeout: float = 60.0 llm_timeout: float = 60.0
# Q&A-pair chunking strategy (Package 8)
default_chunking_strategy: str = "token"
qa_vision_enabled: bool = True
qa_max_chunk_tokens: int = 3000
qa_structure_model: str = ""
qa_include_internal_refs: bool = True
qa_cache_vision_results: bool = True
# Alibaba Cloud DashScope ASR (Phase 2) # Alibaba Cloud DashScope ASR (Phase 2)
dashscope_api_key: str = "" dashscope_api_key: str = ""
asr_model_name: str = "qwen3-asr-flash" asr_model_name: str = "qwen3-asr-flash"

View File

@ -8,6 +8,7 @@ class DocumentInfo(BaseModel):
filename: str filename: str
chunk_count: int chunk_count: int
upload_date: str upload_date: str
chunking_strategy: str = "token"
class ChunkInfo(BaseModel): class ChunkInfo(BaseModel):
@ -16,6 +17,14 @@ class ChunkInfo(BaseModel):
content_summary: str content_summary: str
page_number: Optional[int] = None page_number: Optional[int] = None
chunk_file_path: Optional[str] = None chunk_file_path: Optional[str] = None
strategy_type: Optional[str] = None
question_index: Optional[int] = None
question_id: Optional[str] = None
question_text: Optional[str] = None
section_heading: Optional[str] = None
answer_contains_table: Optional[bool] = None
source_page_range: Optional[List[int]] = None
parent_topic: Optional[str] = None
class DocumentListResponse(BaseModel): class DocumentListResponse(BaseModel):

View File

@ -1,7 +1,18 @@
from typing import Literal
from pydantic import BaseModel from pydantic import BaseModel
ChunkingStrategyType = Literal["token", "question"]
VALID_CHUNKING_STRATEGIES = frozenset({"token", "question"})
class IngestRequest(BaseModel):
strategy: ChunkingStrategyType = "token"
class IngestResponse(BaseModel): class IngestResponse(BaseModel):
document_id: str document_id: str
chunk_count: int chunk_count: int
filename: str filename: str
strategy: ChunkingStrategyType = "token"

View File

@ -5,9 +5,9 @@ import tempfile
import uuid import uuid
from pathlib import Path from pathlib import Path
from fastapi import APIRouter, UploadFile, File, HTTPException from fastapi import APIRouter, UploadFile, File, HTTPException, Query
from app.models.ingest import IngestResponse from app.models.ingest import IngestResponse, VALID_CHUNKING_STRATEGIES
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
router = APIRouter(tags=["ingest"]) router = APIRouter(tags=["ingest"])
@ -37,11 +37,14 @@ def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None:
@router.post("/ingest", response_model=IngestResponse) @router.post("/ingest", response_model=IngestResponse)
async def ingest_document(file: UploadFile = File(...)): async def ingest_document(
file: UploadFile = File(...),
strategy: str = Query("token"),
):
"""Ingest a document into the RAG system.""" """Ingest a document into the RAG system."""
from app.core.config import get_settings from app.core.config import get_settings
from app.services.rag import RAGService from app.services.rag import RAGService
from app.utils.chunking import TokenChunkingStrategy from app.utils.chunking import get_chunking_strategy
from app.utils.metadata import extract_metadata from app.utils.metadata import extract_metadata
filename = file.filename or "unknown" filename = file.filename or "unknown"
@ -53,6 +56,12 @@ async def ingest_document(file: UploadFile = File(...)):
detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}", detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}",
) )
if strategy not in VALID_CHUNKING_STRATEGIES:
raise HTTPException(
status_code=400,
detail=f"Invalid chunking strategy: {strategy}. Valid: {', '.join(sorted(VALID_CHUNKING_STRATEGIES))}",
)
settings = get_settings() settings = get_settings()
temp_path = None temp_path = None
try: try:
@ -68,9 +77,7 @@ async def ingest_document(file: UploadFile = File(...)):
_delete_existing_document(rag, filename, chunk_dir) _delete_existing_document(rag, filename, chunk_dir)
document_id = str(uuid.uuid4()) document_id = str(uuid.uuid4())
chunker = TokenChunkingStrategy( chunker = get_chunking_strategy(strategy, settings)
chunk_size=settings.chunk_size, overlap=settings.chunk_overlap
)
if file_ext == ".pdf": if file_ext == ".pdf":
from app.utils.pdf_parser import parse_pdf_by_page from app.utils.pdf_parser import parse_pdf_by_page
@ -105,6 +112,8 @@ async def ingest_document(file: UploadFile = File(...)):
) )
chunk_file_paths.append(None) chunk_file_paths.append(None)
chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
metadata = extract_metadata( metadata = extract_metadata(
temp_path, temp_path,
chunk_texts, chunk_texts,
@ -112,6 +121,8 @@ async def ingest_document(file: UploadFile = File(...)):
page_numbers=page_numbers, page_numbers=page_numbers,
chunk_file_paths=chunk_file_paths, chunk_file_paths=chunk_file_paths,
document_id=document_id, document_id=document_id,
strategy_type=strategy,
chunk_metadata=chunk_metadata,
) )
rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id) rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id)
@ -145,9 +156,12 @@ async def ingest_document(file: UploadFile = File(...)):
) )
chunk_file_paths.append(None) chunk_file_paths.append(None)
chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
metadata = extract_metadata( metadata = extract_metadata(
temp_path, chunks, original_filename=filename, temp_path, chunks, original_filename=filename,
chunk_file_paths=chunk_file_paths, document_id=document_id, chunk_file_paths=chunk_file_paths, document_id=document_id,
strategy_type=strategy, chunk_metadata=chunk_metadata,
) )
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id) rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
@ -180,9 +194,12 @@ async def ingest_document(file: UploadFile = File(...)):
) )
chunk_file_paths.append(None) chunk_file_paths.append(None)
chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
metadata = extract_metadata( metadata = extract_metadata(
temp_path, chunks, original_filename=filename, temp_path, chunks, original_filename=filename,
chunk_file_paths=chunk_file_paths, document_id=document_id, chunk_file_paths=chunk_file_paths, document_id=document_id,
strategy_type=strategy, chunk_metadata=chunk_metadata,
) )
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id) rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
@ -193,6 +210,7 @@ async def ingest_document(file: UploadFile = File(...)):
document_id=document_id, document_id=document_id,
chunk_count=chunk_count, chunk_count=chunk_count,
filename=filename, filename=filename,
strategy=strategy,
) )
except HTTPException: except HTTPException:

View File

@ -0,0 +1,60 @@
"""Acceptance tests: Phase 8 Q&A-pair chunking with real LTT PDFs.
Prerequisites:
- ChromaDB running (local)
- .env configured with valid LLM_BASE_URL, LLM_API_KEY, LLM_MODEL_NAME
- Test PDFs available in ../../test materials/LTT/
These tests require real LLM calls and actual LegCo PDFs.
Run manually: pytest app/test/acceptance/test_acceptance_phase8_qa_chunking.py -v -m acceptance
"""
import os
import sys
import pytest
@pytest.mark.acceptance
@pytest.mark.slow
class TestRealQaChunking:
"""End-to-end Q&A chunking with real LegCo PDFs from test materials/LTT/."""
@pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
def test_real_qa_chunking_fileE(self):
"""File E produces 12 Chinese Q&A pairs + 3 Others + narrative sections."""
pass
@pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
def test_real_qa_chunking_fileL(self):
"""File L produces 24 English Q&A pairs + narrative sections."""
pass
@pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
def test_real_qa_chunking_fileB(self):
"""File B produces 3 Chinese Q&A pairs + narrative sections."""
pass
@pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
def test_real_qa_chunking_fileA(self):
"""File A falls back to narrative chunking (no Q&A, should not error)."""
pass
@pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
def test_table_extraction_fileE(self):
"""Tables in File E answers converted to markdown."""
pass
@pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
def test_table_extraction_fileL(self):
"""Tables in File L answers converted to markdown."""
pass
@pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
def test_qa_page_references(self):
"""Each Q&A chunk's page number points to question (問) location."""
pass
@pytest.mark.skip(reason="Requires full pipeline with LLM, embeddings, ChromaDB")
def test_full_pipeline_question_strategy(self):
"""Full ingest -> retrieve -> query pipeline with Q&A chunks."""
pass

View File

@ -31,3 +31,47 @@ def test_config_default_values(monkeypatch):
settings = Settings() settings = Settings()
assert settings.llm_base_url == "https://openrouter.ai/api/v1" assert settings.llm_base_url == "https://openrouter.ai/api/v1"
assert settings.llm_model_name == "qwen/qwen3.5-35b-a3b" assert settings.llm_model_name == "qwen/qwen3.5-35b-a3b"
def test_qa_chunking_config_defaults(monkeypatch):
    """Phase 8.0: Q&A chunking config fields have correct defaults.

    The Q&A-related environment variables are removed and dotenv loading is
    switched off for this one ``Settings`` instance, so the assertions see the
    defaults declared on the class rather than values leaking in from the
    process environment or from a ``.env`` file in the working directory.
    """
    qa_env_vars = (
        "DEFAULT_CHUNKING_STRATEGY",
        "QA_VISION_ENABLED",
        "QA_MAX_CHUNK_TOKENS",
        "QA_STRUCTURE_MODEL",
        "QA_INCLUDE_INTERNAL_REFS",
        "QA_CACHE_VISION_RESULTS",
    )
    for var_name in qa_env_vars:
        monkeypatch.delenv(var_name, raising=False)

    from app.core.config import Settings

    # Deleting env vars is not enough: a local .env could still supply
    # non-default values. ``_env_file=None`` skips dotenv loading entirely.
    settings = Settings(_env_file=None)

    assert settings.default_chunking_strategy == "token"
    assert settings.qa_vision_enabled is True
    assert settings.qa_max_chunk_tokens == 3000
    assert settings.qa_structure_model == ""
    assert settings.qa_include_internal_refs is True
    assert settings.qa_cache_vision_results is True
def test_qa_chunking_config_from_env(tmp_path, monkeypatch):
"""Phase 8.0: Q&A chunking config fields load from .env."""
env_file = tmp_path / ".env"
env_file.write_text(
"DEFAULT_CHUNKING_STRATEGY=question\n"
"QA_VISION_ENABLED=false\n"
"QA_MAX_CHUNK_TOKENS=5000\n"
"QA_STRUCTURE_MODEL=anthropic/claude-3-haiku\n"
"QA_INCLUDE_INTERNAL_REFS=false\n"
"QA_CACHE_VISION_RESULTS=false\n"
)
monkeypatch.chdir(tmp_path)
from app.core.config import Settings
settings = Settings()
assert settings.default_chunking_strategy == "question"
assert settings.qa_vision_enabled is False
assert settings.qa_max_chunk_tokens == 5000
assert settings.qa_structure_model == "anthropic/claude-3-haiku"
assert settings.qa_include_internal_refs is False
assert settings.qa_cache_vision_results is False

View File

@ -0,0 +1,209 @@
"""Phase 8 tests: Ingest API integration with strategy selection (Sub-Phase 8.3).
Covers:
- POST /api/v1/ingest?strategy=token: existing behavior unchanged
- POST /api/v1/ingest?strategy=question: Q&A chunking applied
- Invalid strategy values return 400
- IngestResponse includes strategy field
- TXT with Q&A format uses question strategy
- Document without Q&A falls back gracefully
"""
import io
import json
from typing import List, Tuple
from unittest.mock import MagicMock
import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
from pypdf import PdfWriter
from app.routers.ingest import router
class _DeterministicEmbedding:
    """Deterministic stand-in embedding function for tests.

    Every text is mapped to a fixed-length vector computed only from its
    leading characters, so the same input always yields the same vector.
    """

    # Length of every produced vector, and also the number of leading
    # characters of a text that contribute to it. Kept in one place so the
    # slice bound and the vector size can never drift apart.
    DIMENSION = 384

    def name(self) -> str:
        """Return the identifier reported for this embedding function."""
        return "test_deterministic"

    def __call__(self, input):
        """Embed an iterable of texts; returns one vector per text."""
        return self._embed(input)

    def embed_query(self, input):
        """Embed query texts with the same mapping used by ``__call__``."""
        return self._embed(input)

    @staticmethod
    def _embed(texts):
        """Turn each text into a ``DIMENSION``-long list of floats.

        Position ``i`` holds ``ord(text[i]) / 1000.0`` for the first
        ``DIMENSION`` characters; remaining positions are ``0.0``.
        """
        dim = _DeterministicEmbedding.DIMENSION
        vectors = []
        for text in texts:
            codes = [ord(ch) / 1000.0 for ch in text[:dim]]
            # Zero-pad short texts so all vectors share the same length.
            vectors.append(codes + [0.0] * (dim - len(codes)))
        return vectors
def _create_real_pdf(content: str) -> bytes:
writer = PdfWriter()
writer.add_blank_page(width=200, height=200)
buf = io.BytesIO()
writer.write(buf)
return buf.getvalue()
def _create_text_txt(content: str) -> bytes:
return content.encode("utf-8")
@pytest.fixture
def client(tmp_path, monkeypatch):
"""TestClient with real ChromaDB isolated in tmp_path + deterministic embeddings."""
chroma_path = str(tmp_path / "chroma_db")
chunk_path = str(tmp_path / "document_chunk")
prompts_path = str(tmp_path / "prompts.db")
history_path = str(tmp_path / "history.db")
monkeypatch.setenv("CHROMA_DB_PATH", chroma_path)
monkeypatch.setenv("DOCUMENT_CHUNK_PATH", chunk_path)
monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path)
monkeypatch.setenv("HISTORY_DB_PATH", history_path)
monkeypatch.setenv("EMBEDDING_MODEL", "test-mock")
monkeypatch.setenv("LLM_API_KEY", "test-key")
from app.core.config import get_settings
get_settings.cache_clear()
from app.core.dependencies import get_settings_cached
get_settings_cached.cache_clear()
from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles
conn = _get_db(prompts_path)
init_prompts_db(conn)
seed_default_profiles(conn)
conn.close()
hconn = _get_db(history_path)
init_history_db(hconn)
hconn.close()
monkeypatch.setattr(
"app.core.database.get_embedding_function_settings",
lambda settings: _DeterministicEmbedding(),
)
test_app = FastAPI()
test_app.include_router(router, prefix="/api/v1")
yield TestClient(test_app)
get_settings_cached.cache_clear()
get_settings.cache_clear()
def test_ingest_with_strategy_token(client):
"""Existing behavior unchanged: strategy=token uses TokenChunkingStrategy."""
txt_bytes = _create_text_txt("This is a test document with enough content to generate chunks.")
resp = client.post(
"/api/v1/ingest?strategy=token",
files={"file": ("test.txt", txt_bytes, "text/plain")},
)
assert resp.status_code == 200
data = resp.json()
assert data["strategy"] == "token"
assert data["chunk_count"] > 0
def test_ingest_invalid_strategy_rejected(client):
"""Invalid strategy values return 400."""
txt_bytes = _create_text_txt("test")
resp = client.post(
"/api/v1/ingest?strategy=invalid",
files={"file": ("test.txt", txt_bytes, "text/plain")},
)
assert resp.status_code == 400
assert "strategy" in resp.json()["detail"].lower()
def test_ingest_response_includes_strategy(client):
"""IngestResponse includes the strategy field."""
txt_bytes = _create_text_txt("Strategy response test content with more text to ensure chunks.")
resp = client.post(
"/api/v1/ingest?strategy=token",
files={"file": ("test.txt", txt_bytes, "text/plain")},
)
assert resp.status_code == 200
assert "strategy" in resp.json()
def test_ingest_default_strategy_is_token(client):
"""When no strategy param provided, default to token."""
txt_bytes = _create_text_txt("Default strategy test with enough text to generate output.")
resp = client.post(
"/api/v1/ingest",
files={"file": ("test.txt", txt_bytes, "text/plain")},
)
assert resp.status_code == 200
assert resp.json()["strategy"] == "token"
def test_ingest_question_strategy_txt(client, monkeypatch):
"""TXT with Q&A format uses question strategy and produces chunks."""
_mock_question_chunker(monkeypatch)
txt_bytes = _create_text_txt("問A1test question\n答A1test answer with more text here to ensure chunking works properly.")
resp = client.post(
"/api/v1/ingest?strategy=question",
files={"file": ("test.txt", txt_bytes, "text/plain")},
)
assert resp.status_code == 200
data = resp.json()
assert data["strategy"] == "question"
assert data["chunk_count"] > 0
def test_ingest_question_strategy_no_qa_fallback(client, monkeypatch):
"""Document without Q&A markers falls back to narrative chunking without error."""
_mock_question_chunker(monkeypatch)
txt_bytes = _create_text_txt("This is plain text without any Q&A markers, but it needs to be long enough to generate at least one chunk when processed by the tokenizer.")
resp = client.post(
"/api/v1/ingest?strategy=question",
files={"file": ("plain.txt", txt_bytes, "text/plain")},
)
assert resp.status_code == 200
data = resp.json()
assert data["strategy"] == "question"
assert data["chunk_count"] > 0
def _mock_question_chunker(monkeypatch):
"""Replace QuestionChunkingStrategy with a mock that returns test chunks."""
class _MockQuestionChunker:
def __init__(self, settings=None, llm_client=None):
self._chunk_metadata = [
{
"strategy_type": "question",
"section_type": "qa",
"question_index": 0,
"question_id": "A1",
"question_text": "What is X?",
"section_heading": "(A) Topic",
"answer_contains_table": False,
"source_page_range": [1, 2],
}
]
self._max_tokens = 3000
def chunk(self, text):
self._chunk_metadata = self._chunk_metadata[:1]
return ["Question: What is X?\n\nAnswer: X is Y."]
def chunk_pages(self, pages, overlap_tokens=0):
self._chunk_metadata = self._chunk_metadata[:1]
return [("Question: What is X?\n\nAnswer: X is Y.", 1)]
monkeypatch.setattr(
"app.utils.chunking.QuestionChunkingStrategy",
_MockQuestionChunker,
)

View File

@ -0,0 +1,149 @@
"""Phase 8 tests: Q&A metadata enrichment (Sub-Phase 8.2).
Covers:
- Metadata enrichment with Q&A-specific fields via chunk_metadata param
- Backward compatibility: token strategy unchanged
- Page number references question location
- Chunk metadata merging with base metadata
"""
import json
import pytest
from app.utils.metadata import extract_metadata
def test_qa_metadata_fields(tmp_path):
"""strategy_type, question_index, question_id, question_text merged via chunk_metadata."""
file_path = tmp_path / "test.pdf"
file_path.write_text("dummy content")
chunks = ["chunk 1", "chunk 2"]
chunk_metadata = [
{
"strategy_type": "question",
"section_type": "qa",
"question_index": 0,
"question_id": "A1",
"question_text": "What is X?",
"section_heading": "(A) Section",
"answer_contains_table": True,
"source_page_range": [2, 5],
"parent_topic": "Topic Name",
},
{
"strategy_type": "question",
"section_type": "qa",
"question_index": 1,
"question_id": "A2",
"question_text": "What is Y?",
"section_heading": "(A) Section",
"answer_contains_table": False,
"source_page_range": [5, 7],
},
]
metadata = extract_metadata(
file_path=str(file_path),
chunks=chunks,
strategy_type="question",
chunk_metadata=chunk_metadata,
)
assert len(metadata) == 2
m0 = metadata[0]
assert m0["strategy_type"] == "question"
assert m0["section_type"] == "qa"
assert m0["question_index"] == 0
assert m0["question_id"] == "A1"
assert m0["question_text"] == "What is X?"
assert m0["section_heading"] == "(A) Section"
assert m0["answer_contains_table"] is True
assert m0["source_page_range"] == [2, 5]
assert m0["parent_topic"] == "Topic Name"
m1 = metadata[1]
assert m1["question_index"] == 1
assert m1["question_id"] == "A2"
assert m1["answer_contains_table"] is False
def test_qa_metadata_topic_section(tmp_path):
"""section_heading and parent_topic are both preserved."""
file_path = tmp_path / "test.pdf"
file_path.write_text("dummy content")
metadata = extract_metadata(
file_path=str(file_path),
chunks=["chunk"],
strategy_type="question",
chunk_metadata=[{"section_heading": "(B) Traffic", "parent_topic": "Traffic Planning"}],
)
assert metadata[0]["section_heading"] == "(B) Traffic"
assert metadata[0]["parent_topic"] == "Traffic Planning"
def test_token_metadata_unchanged(tmp_path):
"""Existing metadata fields unchanged for token strategy (no chunk_metadata)."""
file_path = tmp_path / "test.txt"
file_path.write_text("test content")
metadata = extract_metadata(
file_path=str(file_path),
chunks=["chunk 1", "chunk 2"],
original_filename="original.txt",
strategy_type="token",
)
assert len(metadata) == 2
for m in metadata:
assert "filename" in m
assert "upload_date" in m
assert "content_summary" in m
assert "chunk_index" in m
assert m.get("strategy_type", "token") == "token"
assert "question_id" not in m
def test_page_number_from_question(tmp_path):
"""Page ref should point to question location (pass via page_numbers from strategy)."""
file_path = tmp_path / "test.pdf"
file_path.write_text("dummy content")
metadata = extract_metadata(
file_path=str(file_path),
chunks=["question chunk"],
page_numbers=[3],
strategy_type="question",
chunk_metadata=[{
"question_id": "A1",
"source_page_range": [3, 8],
}],
)
assert metadata[0]["page_number"] == 3
assert metadata[0]["source_page_range"] == [3, 8]
def test_chunk_metadata_length_mismatch(tmp_path):
"""chunk_metadata length mismatch with chunks raises ValueError."""
file_path = tmp_path / "test.pdf"
file_path.write_text("dummy content")
with pytest.raises(ValueError, match="chunk_metadata length"):
extract_metadata(
file_path=str(file_path),
chunks=["a", "b", "c"],
chunk_metadata=[{}, {}],
)
def test_chunk_metadata_empty_no_error(tmp_path):
"""Empty chunk_metadata list with matching chunks is valid."""
file_path = tmp_path / "test.pdf"
file_path.write_text("dummy content")
metadata = extract_metadata(
file_path=str(file_path),
chunks=["a"],
chunk_metadata=[],
)
assert len(metadata) == 1

View File

@ -0,0 +1,481 @@
"""Phase 8 tests: Q&A-pair chunking strategy (Sub-Phase 8.1).
Covers:
- LLM structure detection response parsing (parse_llm_structure_response)
- Mixed format handling (問/答 + section headings)
- Narrative-only text (no Q&A format)
- Speaking notes (發言要點) chunking by bullet
- Regex fast-pass for Chinese 問/答 format
- Regex fast-pass for English Q1/Q2 format
- Multi-page section tracking with [PAGE_BREAK] markers
- ChunkingStrategy ABC compliance
- Page number references question (問) page, not answer
- Size limit: oversized sections recursively split with heading preserved
- build_chunks_from_sections output verification
- preprocess_text: footer stripping, colon normalization, page break insertion
"""
import json
from typing import List, Tuple
from unittest.mock import AsyncMock, MagicMock
import pytest
from app.utils.qa_chunking import (
Section,
preprocess_text,
build_structure_detection_prompt,
parse_llm_structure_response,
split_chinese_qa,
split_english_qa,
build_chunks_from_sections,
)
from app.utils.chunking import (
ChunkingStrategy,
QuestionChunkingStrategy,
get_chunking_strategy,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def mock_settings():
"""Minimal Settings mock with Q&A chunking defaults."""
s = MagicMock()
s.default_chunking_strategy = "question"
s.qa_vision_enabled = False
s.qa_max_chunk_tokens = 3000
s.qa_structure_model = ""
s.qa_include_internal_refs = True
s.qa_cache_vision_results = True
s.chunk_size = 1000
s.chunk_overlap = 200
s.llm_model_name = "test-model"
s.llm_api_key = "test-key"
s.llm_base_url = "https://example.com/v1"
s.llm_timeout = 30.0
s.llm_enable_thinking = False
s.vllm_engine = False
return s
SAMPLE_LLM_RESPONSE = json.dumps({
"sections": [
{
"type": "qa",
"heading": "(A) 排水系統",
"qa_id": "A1",
"question": "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?",
"answer": "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。",
"start_page": 2,
"end_page": 3,
"has_table": False,
"parent_topic": "排水系統",
},
{
"type": "narrative",
"heading": "(1) 住戶的安置補償",
"content": "合資格住戶可選擇(i)「須通過經濟狀況審查」安置安排。",
"start_page": 2,
"end_page": 5,
"has_table": False,
},
{
"type": "speaking_notes",
"heading": "發言要點",
"content": "⚫ 古洞北/粉嶺北新發展區是北部都會區內的新市鎮發展項目\n⚫ 第一期發展預計於2030年完成",
"start_page": 1,
"end_page": 2,
"has_table": False,
},
]
})
# ---------------------------------------------------------------------------
# Test: LLM structure detection parsing
# ---------------------------------------------------------------------------
class TestLLMStructureDetection:
    """Tests for turning LLM structure-detection JSON into parsed sections."""

    def test_llm_structure_detection(self):
        """parse_llm_structure_response correctly parses LLM JSON with qa/narrative/speaking_notes."""
        sections = parse_llm_structure_response(SAMPLE_LLM_RESPONSE)
        assert len(sections) == 3

        qa = sections[0]
        assert qa.type == "qa"
        assert qa.qa_id == "A1"
        assert qa.question == "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?"
        assert qa.answer == "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。"
        assert qa.start_page == 2
        assert qa.end_page == 3
        assert qa.heading == "(A) 排水系統"
        assert qa.parent_topic == "排水系統"

        narr = sections[1]
        assert narr.type == "narrative"
        assert narr.heading == "(1) 住戶的安置補償"
        assert "合資格住戶" in narr.content

        notes = sections[2]
        assert notes.type == "speaking_notes"
        # Check for the actual bullet glyph used in the fixture; testing for
        # an empty string would be true for any content.
        assert "⚫" in notes.content

    def test_llm_handles_mixed_formats(self):
        """Document with 問/答 markers + section headings correctly classified."""
        mixed_json = json.dumps({
            "sections": [
                {
                    "type": "qa",
                    "heading": "(B) 交通",
                    "qa_id": "B1",
                    "question": "新建道路何時通車?",
                    "answer": "預計2027年通車。",
                    "start_page": 3,
                    "end_page": 4,
                    "has_table": False,
                },
                {
                    "type": "narrative",
                    "heading": "背景",
                    "content": "本文件說明交通規劃。",
                    "start_page": 1,
                    "end_page": 2,
                    "has_table": False,
                },
            ]
        })
        sections = parse_llm_structure_response(mixed_json)
        assert len(sections) == 2
        assert sections[0].type == "qa"
        assert sections[1].type == "narrative"

    def test_llm_handles_no_qa_format(self):
        """Narrative-only text (like File L pages 1-13) produces only narrative sections."""
        narrative_json = json.dumps({
            "sections": [
                {
                    "type": "narrative",
                    "heading": "Introduction",
                    "content": "This document provides background on policy matters.",
                    "start_page": 1,
                    "end_page": 5,
                    "has_table": False,
                },
                {
                    "type": "narrative",
                    "heading": "Analysis",
                    "content": "The analysis covers multiple dimensions.",
                    "start_page": 5,
                    "end_page": 13,
                    "has_table": False,
                },
            ]
        })
        sections = parse_llm_structure_response(narrative_json)
        assert len(sections) == 2
        assert all(s.type == "narrative" for s in sections)

    def test_llm_handles_speaking_notes(self):
        """發言要點 text with bullet points produces speaking_notes sections."""
        notes_json = json.dumps({
            "sections": [
                {
                    "type": "speaking_notes",
                    "heading": "發言要點",
                    "content": "⚫ 要點一:政策方向\n⚫ 要點二:實施計劃\n⚫ 要點三:預算安排",
                    "start_page": 1,
                    "end_page": 2,
                    "has_table": False,
                },
            ]
        })
        sections = parse_llm_structure_response(notes_json)
        assert len(sections) == 1
        assert sections[0].type == "speaking_notes"
        # Count the bullet glyph itself: the fixture contains exactly three.
        # Counting the empty string would return len(content) + 1 instead.
        assert sections[0].content.count("⚫") == 3

    def test_parse_markdown_fenced_json(self):
        """parse_llm_structure_response handles ```json ... ``` wrapped responses."""
        fenced = '```json\n' + SAMPLE_LLM_RESPONSE + '\n```'
        sections = parse_llm_structure_response(fenced)
        assert len(sections) == 3

    def test_parse_invalid_json_raises(self):
        """parse_llm_structure_response raises ValueError on non-JSON input."""
        with pytest.raises(ValueError, match="Invalid JSON"):
            parse_llm_structure_response("this is not json")
# ---------------------------------------------------------------------------
# Test: Regex fast-pass
# ---------------------------------------------------------------------------
class TestRegexFastPass:
def test_regex_fastpass_chinese(self):
"""Text with 問B1/答B1 markers detected by split_chinese_qa without LLM."""
text = (
"(A) 排水系統\n"
"問 B1古洞北的設計是否能抵禦氣候變化\n"
"答 B1研究顧問已為古洞北新發展區進行了評估。\n"
"問 B2第二個問題是什麼\n"
"答 B2這是第二個問題的答案。\n"
)
sections = split_chinese_qa(text)
assert len(sections) >= 2
# All should be QA type
assert all(s.type == "qa" for s in sections)
# First should have question containing 古洞北
assert "古洞北" in sections[0].question
def test_regex_fastpass_chinese_no_match(self):
"""split_chinese_qa returns empty list when no markers found."""
text = "This is plain text without any Q&A markers."
assert split_chinese_qa(text) == []
def test_regex_fastpass_english(self):
"""Text with Q1, Q2 markers detected by split_english_qa without LLM."""
text = (
"Background information here.\n\n"
"Q1 What is the timeline for the project?\n"
"The project is expected to complete by 2027.\n"
"Q2 How much will it cost?\n"
"The estimated cost is HK$500 million.\n"
)
sections = split_english_qa(text)
assert len(sections) >= 2
assert all(s.type == "qa" for s in sections)
assert any("timeline" in (s.question or "").lower() for s in sections)
def test_regex_fastpass_english_no_match(self):
"""split_english_qa returns empty list when no markers found."""
text = "純中文文本沒有英文問答標記。"
assert split_english_qa(text) == []
# ---------------------------------------------------------------------------
# Test: Multi-page tracking
# ---------------------------------------------------------------------------
class TestMultiPage:
def test_multi_page_sections(self):
"""Sections with [PAGE_BREAK: N] markers spanning pages track correctly."""
pages = [
(1, "Header line\n(A) Water drainage\nSome intro text"),
(2, "More drainage info\nFooter text X-1"),
(3, "New section begins\n(B) Traffic planning"),
]
text = preprocess_text(pages)
# Should have page break markers
assert "[PAGE_BREAK: 1]" in text
assert "[PAGE_BREAK: 2]" in text
assert "[PAGE_BREAK: 3]" in text
# ---------------------------------------------------------------------------
# Test: ABC contract
# ---------------------------------------------------------------------------
class TestABCContract:
    def test_abc_contract(self):
        """A QuestionChunkingStrategy instance is a ChunkingStrategy."""
        settings_stub = MagicMock(
            qa_max_chunk_tokens=3000,
            qa_include_internal_refs=True,
        )
        assert isinstance(
            QuestionChunkingStrategy(settings=settings_stub), ChunkingStrategy
        )

    def test_get_chunking_strategy_factory(self, mock_settings):
        """The factory maps each strategy name to the expected class."""
        expectations = (
            ("token", ChunkingStrategy),
            ("question", QuestionChunkingStrategy),
        )
        for strategy_name, expected_type in expectations:
            built = get_chunking_strategy(strategy_name, mock_settings)
            assert isinstance(built, expected_type)
# ---------------------------------------------------------------------------
# Test: Page number reference
# ---------------------------------------------------------------------------
class TestPageNumberReference:
    def test_page_number_reference_question(self):
        """The chunk's page number is the section start page, not its end page."""
        qa_section = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question="What is X?",
            answer="X is Y.",
            start_page=5,
            end_page=8,
        )

        chunks = build_chunks_from_sections([qa_section])

        assert len(chunks) == 1
        _text, page_num, metadata = chunks[0]
        # start_page is where the question sits; the full span lives in metadata.
        assert page_num == 5
        assert metadata.get("source_page_range") == [5, 8]
# ---------------------------------------------------------------------------
# Test: Size limit recursive split
# ---------------------------------------------------------------------------
class TestSizeLimit:
    def test_size_limit(self):
        """An over-budget Q&A answer is split and every piece repeats the question."""
        question = "What is the detailed plan?"
        # 80 paragraphs of ~215 characters each: far above the 500-token budget below.
        paragraphs = [f"Paragraph {i}: " + "x" * 200 for i in range(80)]
        oversized = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question=question,
            answer="\n\n".join(paragraphs),
            start_page=2,
            end_page=5,
            has_table=False,
        )

        chunks = build_chunks_from_sections([oversized], max_tokens=500)

        assert len(chunks) > 1
        for piece_text, piece_page, _piece_meta in chunks:
            assert question in piece_text
            # Every piece points back at the page holding the question.
            assert piece_page == 2
# ---------------------------------------------------------------------------
# Test: build_chunks_from_sections
# ---------------------------------------------------------------------------
class TestBuildChunksFromSections:
    def test_build_chunks_from_sections(self):
        """Each section type yields the expected chunks and metadata.

        The fixture contains one Q&A pair, one narrative section, a
        speaking-notes section with two bullets and a table of contents.
        """
        sections = [
            Section(
                type="qa",
                heading="(A) 排水系統",
                qa_id="A1",
                question="古洞北的設計是否能抵禦氣候變化?",
                answer="研究顧問已為古洞北進行了評估。",
                start_page=2,
                end_page=3,
                has_table=True,
                parent_topic="排水系統",
            ),
            Section(
                type="narrative",
                heading="(1) 住戶的安置補償",
                content="合資格住戶可選擇安置安排。",
                start_page=3,
                end_page=5,
                has_table=False,
            ),
            Section(
                type="speaking_notes",
                heading="發言要點",
                content="⚫ 要點一：政策方向\n⚫ 要點二：實施計劃",
                start_page=1,
                end_page=1,
                has_table=False,
            ),
            Section(
                type="toc",
                heading="目錄",
                content="Page 1 ... Page 2",
                start_page=1,
                end_page=1,
                has_table=False,
            ),
        ]

        chunks = build_chunks_from_sections(sections)

        def of_type(section_type):
            return [c for c in chunks if c[2].get("section_type") == section_type]

        # 1 QA + 1 narrative + 2 speaking-notes bullets + 0 toc. Pin the exact
        # count so that duplicated or leaked chunks are caught.
        assert len(chunks) == 4

        # First chunk: QA
        qa_text, qa_page, qa_meta = chunks[0]
        assert "古洞北" in qa_text
        assert qa_page == 2
        assert qa_meta["section_type"] == "qa"
        assert qa_meta["question_id"] == "A1"
        assert qa_meta["question_index"] == 0
        assert qa_meta["answer_contains_table"] is True
        assert qa_meta["section_heading"] == "(A) 排水系統"

        # Narrative chunk carries both its heading and its body.
        narr_chunks = of_type("narrative")
        assert len(narr_chunks) == 1
        narr_text, _narr_page, _narr_meta = narr_chunks[0]
        assert "住戶的安置補償" in narr_text
        assert "合資格住戶" in narr_text

        # Speaking notes are emitted one chunk per bullet.
        notes_chunks = of_type("speaking_notes")
        assert len(notes_chunks) == 2
        for note_text, _note_page, _note_meta in notes_chunks:
            assert "要點" in note_text

        # Table-of-contents sections never become chunks.
        assert of_type("toc") == []
# ---------------------------------------------------------------------------
# Test: preprocess_text
# ---------------------------------------------------------------------------
class TestPreprocessText:
    def test_preprocess_text(self):
        """Footer markers are stripped, colons normalised, page breaks inserted."""
        # Written as an escape so the fullwidth colon (U+FF1A) cannot be lost
        # by editors or tooling that mangle non-ASCII punctuation.
        fullwidth_colon = "\uff1a"
        pages = [
            (1, "Header\n(A) Section Title\nX-1\n2024-01-15"),
            (
                2,
                f"Content with{fullwidth_colon}fullwidth colon\n"
                f"More text{fullwidth_colon}here",
            ),
        ]

        result = preprocess_text(pages)

        # Should have page break markers
        assert "[PAGE_BREAK: 1]" in result
        assert "[PAGE_BREAK: 2]" in result

        # Fullwidth colons are replaced by ASCII colons. Check the replaced
        # positions directly: the page-break marker itself contains ":" so a
        # bare `":" in result` would pass even without normalisation.
        assert fullwidth_colon not in result
        assert "Content with:fullwidth colon" in result
        assert "More text:here" in result

        # Page footer patterns should be stripped (X-1, dates like 2024-01-15)
        assert "X-1" not in result
        assert "2024-01-15" not in result
# ---------------------------------------------------------------------------
# Test: build_structure_detection_prompt
# ---------------------------------------------------------------------------
class TestBuildPrompt:
    def test_build_structure_detection_prompt(self):
        """The prompt names the section types and embeds the document text."""
        text = "Sample document text [PAGE_BREAK: 1]"

        prompt = build_structure_detection_prompt(text)
        lowered = prompt.lower()

        assert "Legislative Council" in prompt
        # The second operand previously compared against an empty string,
        # which is contained in every string and made this check vacuous.
        assert "qa" in lowered or "問" in prompt
        assert "narrative" in lowered
        assert "speaking_notes" in lowered or "speaking notes" in lowered
        assert text in prompt

View File

@ -6,8 +6,15 @@ token-based windows.
""" """
from __future__ import annotations from __future__ import annotations
import logging
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import List, Tuple from typing import TYPE_CHECKING, List, Optional, Tuple
if TYPE_CHECKING:
from app.core.config import Settings
from app.services.llm_client import LLMClient
logger = logging.getLogger(__name__)
class ChunkingStrategy(ABC): class ChunkingStrategy(ABC):
@ -117,3 +124,111 @@ class TokenChunkingStrategy(ChunkingStrategy):
results.append(("\n".join(parts), page_num)) results.append(("\n".join(parts), page_num))
return results return results
class QuestionChunkingStrategy(ChunkingStrategy):
    """Chunk text by detecting Q&A structure using LLM and/or regex patterns.

    Designed for LegCo documents with explicit 問/答 or Q1/Q2 markers.
    Falls back to a single narrative section when no structure is detected.
    """

    def __init__(
        self,
        settings: "Settings",
        llm_client: Optional["LLMClient"] = None,
    ):
        """Store configuration.

        Args:
            settings: Application settings; ``qa_max_chunk_tokens`` is read
                from it (3000 when the attribute is absent).
            llm_client: Optional client used by ``chunk_pages`` for structure
                detection when neither regex pass finds any Q&A pair.
        """
        self._settings = settings
        self._llm_client = llm_client
        self._max_tokens = getattr(settings, "qa_max_chunk_tokens", 3000)
        # Per-chunk metadata of the most recent chunk()/chunk_pages() call.
        self._chunk_metadata: List[dict] = []

    def chunk(self, text: str) -> List[str]:
        """Split text into chunks using Q&A detection (for DOCX/TXT).

        Tries the Chinese regex pass, then the English one, and finally
        treats the whole text as one narrative section.

        Returns:
            The chunk texts. Empty list for empty or whitespace-only input.
        """
        # Reset first so an early return never leaves metadata from a
        # previous document attached to this instance.
        self._chunk_metadata = []
        if not text or not text.strip():
            return []

        from app.utils.qa_chunking import (
            split_chinese_qa,
            split_english_qa,
            build_chunks_from_sections,
            Section,
        )

        sections = split_chinese_qa(text) or split_english_qa(text)
        if not sections:
            sections = [Section(type="narrative", content=text, start_page=1, end_page=1)]

        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [chunk_text for chunk_text, _, _ in results]

    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
    ) -> List[Tuple[str, int]]:
        """Split page-segmented text using Q&A detection (for PDF).

        Args:
            pages: ``(page_number, page_text)`` tuples.
            overlap_tokens: Not used by this implementation.

        Returns:
            List of ``(chunk_text, page_number)``; the page number is the
            ``start_page`` of the section each chunk was built from.
        """
        self._chunk_metadata = []
        if not pages:
            return []

        from app.utils.qa_chunking import (
            preprocess_text,
            split_chinese_qa,
            split_english_qa,
            build_chunks_from_sections,
            Section,
        )

        full_text = preprocess_text(pages)
        sections = split_chinese_qa(full_text) or split_english_qa(full_text)

        if not sections and self._llm_client is not None:
            sections = self._detect_sections_with_llm(full_text)

        if not sections:
            # Use the real page numbers rather than positions: the input is
            # not guaranteed to start at page 1.
            sections = [
                Section(
                    type="narrative",
                    content=full_text,
                    start_page=pages[0][0],
                    end_page=pages[-1][0],
                )
            ]

        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [(chunk_text, page_num) for chunk_text, page_num, _ in results]

    def _detect_sections_with_llm(self, full_text: str) -> list:
        """Ask the LLM to classify sections; return ``[]`` on any failure.

        This method is synchronous, so it can only drive the async client
        when no event loop is already running in the current thread.
        """
        import asyncio

        from app.utils.qa_chunking import (
            build_structure_detection_prompt,
            parse_llm_structure_response,
        )

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            pass  # No running loop in this thread: safe to run the coroutine.
        else:
            # Blocking here would deadlock the running loop, so skip the LLM
            # step, but say so instead of silently degrading.
            logger.warning(
                "Skipping LLM structure detection: an event loop is already "
                "running in this thread"
            )
            return []

        prompt = build_structure_detection_prompt(full_text)
        try:
            # asyncio.run() also works in worker threads, where
            # asyncio.get_event_loop() raises because no loop is set.
            response = asyncio.run(
                self._llm_client.complete(
                    prompt, temperature=0.3, step_name="StructureDetection"
                )
            )
            return parse_llm_structure_response(response)
        except Exception:
            # Best-effort step: any failure falls back to narrative chunking.
            logger.warning("LLM structure detection failed, using fallback", exc_info=True)
            return []
def get_chunking_strategy(
    name: str,
    settings: "Settings",
    llm_client: Optional["LLMClient"] = None,
) -> ChunkingStrategy:
    """Factory: return the named chunking strategy.

    Args:
        name: "token" or "question". Any other value falls back to the
            token strategy and is logged.
        settings: Application settings instance.
        llm_client: Optional LLM client forwarded to the question strategy
            so its LLM structure-detection fallback can be used. Ignored by
            the token strategy.

    Returns:
        ChunkingStrategy instance.
    """
    if name == "question":
        return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)
    if name != "token":
        # Keep the permissive fallback, but make a mistyped name visible.
        logger.warning("Unknown chunking strategy %r; falling back to 'token'", name)
    return TokenChunkingStrategy(
        chunk_size=settings.chunk_size,
        overlap=settings.chunk_overlap,
    )

View File

@ -12,6 +12,8 @@ def extract_metadata(
page_numbers: List[int | None] | None = None, page_numbers: List[int | None] | None = None,
chunk_file_paths: List[str | None] | None = None, chunk_file_paths: List[str | None] | None = None,
document_id: str | None = None, document_id: str | None = None,
strategy_type: str = "token",
chunk_metadata: List[Dict[str, Any]] | None = None,
) -> List[Dict[str, Any]]: ) -> List[Dict[str, Any]]:
"""Extract metadata for a list of text chunks. """Extract metadata for a list of text chunks.
@ -23,6 +25,10 @@ def extract_metadata(
- chunk_file_path: path to the per-chunk source file - chunk_file_path: path to the per-chunk source file
- document_id: unique identifier linking all chunks to the same document - document_id: unique identifier linking all chunks to the same document
Package 8 Q&A fields (present when chunk_metadata provided):
- strategy_type, section_type, question_index, question_id, question_text,
section_heading, answer_contains_table, source_page_range, parent_topic
Args: Args:
file_path: Path to the file associated with the chunks. file_path: Path to the file associated with the chunks.
chunks: List of string chunks to generate metadata for. chunks: List of string chunks to generate metadata for.
@ -31,6 +37,12 @@ def extract_metadata(
page_numbers: Optional per-chunk page numbers. Length must match chunks. page_numbers: Optional per-chunk page numbers. Length must match chunks.
chunk_file_paths: Optional per-chunk source file paths. Length must match chunks. chunk_file_paths: Optional per-chunk source file paths. Length must match chunks.
document_id: Optional unique document identifier applied to all chunks. document_id: Optional unique document identifier applied to all chunks.
strategy_type: Chunking strategy used ("token" or "question"). Stored in
each chunk's metadata.
chunk_metadata: Optional per-chunk metadata dicts from Q&A strategy.
Each dict is merged into the corresponding base metadata entry.
Length must match chunks. Fields like question_id, question_index,
section_type, etc. are forwarded to ChromaDB metadata.
Returns: Returns:
A list of metadata dictionaries, one per chunk. Empty list if chunks is empty. A list of metadata dictionaries, one per chunk. Empty list if chunks is empty.
@ -55,6 +67,11 @@ def extract_metadata(
f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})" f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
) )
if chunk_metadata is not None and len(chunk_metadata) > 0 and len(chunk_metadata) != len(chunks):
raise ValueError(
f"chunk_metadata length ({len(chunk_metadata)}) does not match chunks length ({len(chunks)})"
)
filename = original_filename if original_filename else os.path.basename(file_path) filename = original_filename if original_filename else os.path.basename(file_path)
upload_date = datetime.now().isoformat() upload_date = datetime.now().isoformat()
@ -68,6 +85,7 @@ def extract_metadata(
"content_summary": content_summary, "content_summary": content_summary,
"chunk_index": idx, "chunk_index": idx,
"document_id": document_id, "document_id": document_id,
"strategy_type": strategy_type,
} }
page_num = page_numbers[idx] if page_numbers else None page_num = page_numbers[idx] if page_numbers else None
if page_num is not None: if page_num is not None:
@ -75,6 +93,8 @@ def extract_metadata(
cfp = chunk_file_paths[idx] if chunk_file_paths else None cfp = chunk_file_paths[idx] if chunk_file_paths else None
if cfp is not None: if cfp is not None:
entry["chunk_file_path"] = cfp entry["chunk_file_path"] = cfp
if chunk_metadata:
entry.update(chunk_metadata[idx])
metadata.append(entry) metadata.append(entry)
return metadata return metadata

View File

@ -0,0 +1,361 @@
"""Q&A-pair chunking utilities for Package 8.
Provides section detection (LLM + regex), text preprocessing,
and chunk building for LegCo documents with Q&A structure.
"""
from __future__ import annotations
import json
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
@dataclass
class Section:
    """A detected section within a LegCo document.

    Instances are produced by the regex splitters and by
    ``parse_llm_structure_response`` and consumed by
    ``build_chunks_from_sections``.
    """

    type: str  # "qa" | "narrative" | "speaking_notes" | "table" | "toc" | "heading_only"
    # Section heading; when non-empty it is prepended to chunk text as "[heading]".
    heading: str = ""
    # Question identifier such as "A1" or "Q3"; filled in for "qa" sections.
    qa_id: Optional[str] = None
    # Question and answer text; read only for "qa" sections.
    question: Optional[str] = None
    answer: Optional[str] = None
    # Body text; read for "narrative", "speaking_notes" and "table" sections.
    content: str = ""
    # Page span; start_page is used as the page number of the resulting chunks.
    start_page: int = 1
    end_page: int = 1
    # Copied into chunk metadata as "answer_contains_table" for "qa" sections.
    has_table: bool = False
    # Copied into chunk metadata as "parent_topic" for "qa" sections.
    parent_topic: str = ""
_FOOTER_RE = re.compile(r"^[A-Z]-\d+\s*$", re.MULTILINE)
_FOOTER_DATE_RE = re.compile(r"^[A-Z]-\d+\s*\n\d{4}-\d{2}-\d{2}$", re.MULTILINE)
_HEADER_LETTER_RE = re.compile(r"^(\([A-Z]\))\s*$", re.MULTILINE)
_FULLWIDTH_COLON_RE = re.compile("[]")
def preprocess_text(pages: List[Tuple[int, str]]) -> str:
"""Concatenate pages, strip footers/headers, normalize colons, insert [PAGE_BREAK: N] markers."""
parts: List[str] = []
for idx, (page_num, page_text) in enumerate(pages):
text = _FOOTER_DATE_RE.sub("", page_text)
text = _FOOTER_RE.sub("", text)
if idx > 0:
text = _HEADER_LETTER_RE.sub("", text)
text = _FULLWIDTH_COLON_RE.sub(":", text)
parts.append(f"[PAGE_BREAK: {page_num}]\n{text}")
return "\n".join(parts)
# Prompt sent to the LLM for section classification. Literal braces of the
# JSON example are doubled because the template goes through str.format().
_STRUCTURE_PROMPT_TEMPLATE = """You are analyzing a Hong Kong Legislative Council document.
The text has page markers like [PAGE_BREAK: N] showing where pages begin.
For each distinct section in this document, identify:
1. The section type:
- "qa": a question-and-answer pair (問/答 or Q1/Q2 format)
- "narrative": policy text, explanatory paragraphs, section content with bullets
- "speaking_notes": briefing points (發言要點) with bullet markers
- "table": standalone data tables (not embedded in answers)
- "toc": table of contents
- "heading_only": a section heading with no following content
2. For "qa" sections:
- The question text (exact)
- The answer text (exact, including tables, bullet lists, and [內部參考] content)
- The question ID if present (e.g. "A1", "Q3")
- The start page and end page
3. For all sections:
- The section heading (e.g. "(A) 排水系統", "(1) 住戶的安置補償")
- The start page and end page
- Whether the section contains tables
Return JSON:
{{
"sections": [
{{
"type": "qa",
"heading": "(A) 排水系統",
"qa_id": "A1",
"question": "...",
"answer": "...",
"start_page": 2,
"end_page": 3,
"has_table": true,
"parent_topic": "排水系統"
}},
{{
"type": "narrative",
"heading": "(1) 住戶的安置補償",
"content": "...",
"start_page": 2,
"end_page": 5,
"has_table": false
}}
]
}}
DOCUMENT TEXT:
{document_text}"""


def build_structure_detection_prompt(text: str) -> str:
    """Construct the LLM prompt for section classification.

    Args:
        text: Document text, typically the output of ``preprocess_text``.
            It is substituted as a value, so braces inside it are safe.

    Returns:
        The prompt template with ``text`` appended after "DOCUMENT TEXT:".
    """
    return _STRUCTURE_PROMPT_TEMPLATE.format(document_text=text)
_MARKDOWN_FENCE_RE = re.compile(r"```(?:json)?\s*\n?(.*?)\n?```", re.DOTALL)
def parse_llm_structure_response(response_text: str) -> List[Section]:
"""Parse the JSON returned by the LLM. Handle markdown code fences.
Raises ValueError if response is not valid JSON.
"""
cleaned = response_text.strip()
fence_match = _MARKDOWN_FENCE_RE.search(cleaned)
if fence_match:
cleaned = fence_match.group(1).strip()
try:
data = json.loads(cleaned)
except json.JSONDecodeError as exc:
raise ValueError(f"Invalid JSON from LLM structure detection: {exc}") from exc
sections_raw = data.get("sections", [])
sections: List[Section] = []
for raw in sections_raw:
sections.append(Section(
type=raw.get("type", "narrative"),
heading=raw.get("heading", ""),
qa_id=raw.get("qa_id"),
question=raw.get("question"),
answer=raw.get("answer"),
content=raw.get("content", ""),
start_page=raw.get("start_page", 1),
end_page=raw.get("end_page", 1),
has_table=raw.get("has_table", False),
parent_topic=raw.get("parent_topic", ""),
))
return sections
_CN_QA_RE = re.compile(
r"\s*([A-Z]\d+)\s*[:]\s*(.*?)\s*"
r"(?:\n\s*答\s*\1\s*[:]\s*(.*?)\s*)"
r"(?=\n\s*(?:問\s*[A-Z]\d+\s*[:]|$))",
re.DOTALL,
)
def split_chinese_qa(text: str) -> List[Section]:
"""Regex fast-pass for 問/答 format. Returns empty list if no matches found."""
sections: List[Section] = []
for m in _CN_QA_RE.finditer(text):
qa_id = m.group(1)
question = m.group(2).strip()
answer = (m.group(3) or "").strip()
sections.append(Section(
type="qa",
qa_id=qa_id,
question=question,
answer=answer,
))
return sections
_EN_QA_RE = re.compile(
r"^(Q\d+)\s+(.*?)\s*$\n((?:(?!^Q\d+).+(?:\n|$))*)",
re.MULTILINE,
)
def split_english_qa(text: str) -> List[Section]:
"""Regex fast-pass for Q-number format. Returns empty list if no matches found."""
sections: List[Section] = []
for m in _EN_QA_RE.finditer(text):
qa_id = m.group(1)
question = m.group(2).strip()
answer = m.group(3).strip()
sections.append(Section(
type="qa",
qa_id=qa_id,
question=question,
answer=answer,
))
return sections
def _estimate_tokens(text: str) -> int:
"""Rough token estimate: ~1.3 tokens per CJK char, ~1 token per 4 chars for Latin."""
cjk_count = 0
latin_len = 0
for ch in text:
if "\u4e00" <= ch <= "\u9fff":
cjk_count += 1
else:
latin_len += 1
return int(cjk_count * 1.3 + latin_len / 4)
def _split_oversized_qa(
    question: str, answer: str, page: int, heading: str,
    qa_id: Optional[str], question_index: int, has_table: bool,
    parent_topic: str, start_page: int, end_page: int,
    max_tokens: int,
) -> List[Tuple[str, int, dict]]:
    """Split an oversized Q&A answer, repeating the question in every piece.

    The answer is cut at paragraph boundaries (blank lines), or at single
    newlines when it contains no blank line, and the parts are greedily
    regrouped so that each piece stays within ``max_tokens`` where possible.
    The split is a single pass: a part that alone exceeds the budget is
    emitted as-is.

    Every piece carries the same "[heading]" prefix that unsplit Q&A chunks
    get, and parts are rejoined with the separator they were split on so the
    answer's own line structure is preserved.

    Returns:
        ``(chunk_text, page, metadata)`` tuples, one per piece.
    """
    separator = "\n\n" if "\n\n" in answer else "\n"
    prefix = f"[{heading}]\n" if heading else ""

    def render(body: str, part_label: str) -> str:
        return f"{prefix}Question: {question}\n\nAnswer (part {part_label}): {body}"

    # Group parts into sub-chunks that fit within max_tokens. The part label
    # is not known yet, so size is estimated with a "1/N" placeholder.
    sub_chunks: List[str] = []
    current = ""
    for part in answer.split(separator):
        candidate = f"{current}{separator}{part}" if current else part
        if current and _estimate_tokens(render(candidate, "1/N")) > max_tokens:
            sub_chunks.append(current)
            current = part
        else:
            current = candidate
    if current:
        sub_chunks.append(current)

    total = len(sub_chunks)
    results: List[Tuple[str, int, dict]] = []
    for number, sub in enumerate(sub_chunks, start=1):
        # A fresh dict per piece so callers may mutate one without affecting others.
        meta = {
            "strategy_type": "question",
            "section_type": "qa",
            "question_index": question_index,
            "question_id": qa_id,
            "question_text": question,
            "section_heading": heading,
            "answer_contains_table": has_table,
            "source_page_range": [start_page, end_page],
            "parent_topic": parent_topic,
        }
        results.append((render(sub, f"{number}/{total}"), page, meta))
    return results
def build_chunks_from_sections(
    sections: List["Section"], max_tokens: int = 3000,
) -> List[Tuple[str, int, dict]]:
    """Build chunk texts + page refs + metadata from sections.

    Handling per section type:
      * "toc" / "heading_only": skipped.
      * "qa": one chunk, or several via ``_split_oversized_qa`` when the
        estimated size exceeds ``max_tokens``.
      * "speaking_notes": one chunk per "⚫" bullet.
      * "table": one chunk.
      * "narrative" and any unrecognised type: one chunk, split at blank
        lines when oversized. Unrecognised types are logged and kept rather
        than dropped, so content is never lost to an unexpected label.

    Non-QA sections whose content is empty, whitespace-only or ``None`` are
    skipped. A non-empty heading is prepended to each chunk as "[heading]".

    Returns List[(chunk_text, page_number, metadata_dict)], where
    page_number is the section's ``start_page``.
    """
    chunks: List[Tuple[str, int, dict]] = []
    qa_index = 0

    for section in sections:
        section_type = section.type
        if section_type in ("toc", "heading_only"):
            continue

        heading = section.heading or ""
        prefix = f"[{heading}]\n" if heading else ""
        page = section.start_page
        page_range = [section.start_page, section.end_page]

        if section_type == "qa":
            question_text = section.question or ""
            answer_text = section.answer or ""
            chunk_text = f"{prefix}Question: {question_text}\n\nAnswer: {answer_text}"

            if _estimate_tokens(chunk_text) > max_tokens:
                chunks.extend(_split_oversized_qa(
                    question=question_text,
                    answer=answer_text,
                    page=page,
                    heading=heading,
                    qa_id=section.qa_id,
                    question_index=qa_index,
                    has_table=section.has_table,
                    parent_topic=section.parent_topic,
                    start_page=section.start_page,
                    end_page=section.end_page,
                    max_tokens=max_tokens,
                ))
            else:
                meta: Dict = {
                    "strategy_type": "question",
                    "section_type": "qa",
                    "question_index": qa_index,
                    "question_id": section.qa_id,
                    "question_text": question_text,
                    "section_heading": heading,
                    "answer_contains_table": section.has_table,
                    "source_page_range": page_range,
                    "parent_topic": section.parent_topic,
                }
                chunks.append((chunk_text, page, meta))
            qa_index += 1
            continue

        # Sections parsed from LLM output may carry content=None.
        content = section.content or ""
        if not content.strip():
            continue

        if section_type == "speaking_notes":
            # The zero-width lookahead splits in front of each bullet marker,
            # so the marker stays with its bullet text.
            bullets = [b.strip() for b in re.split(r"(?=⚫)", content) if b.strip()]
            for bullet in bullets:
                meta = {
                    "strategy_type": "question",
                    "section_type": "speaking_notes",
                    "section_heading": heading,
                    "source_page_range": list(page_range),
                }
                chunks.append((f"{prefix}{bullet}", page, meta))

        elif section_type == "table":
            meta = {
                "strategy_type": "question",
                "section_type": "table",
                "section_heading": heading,
                "answer_contains_table": True,
                "source_page_range": page_range,
            }
            chunks.append((f"{prefix}{content}", page, meta))

        else:
            if section_type != "narrative":
                logger.warning(
                    "Unknown section type %r; chunking it as narrative", section_type
                )
            meta = {
                "strategy_type": "question",
                "section_type": "narrative",
                "section_heading": heading,
                "source_page_range": page_range,
            }
            if _estimate_tokens(f"{prefix}{content}") <= max_tokens:
                chunks.append((f"{prefix}{content}", page, meta))
                continue

            # Oversized: regroup paragraphs greedily; each piece keeps the prefix.
            current = ""
            for para in content.split("\n\n"):
                candidate = f"{current}\n\n{para}" if current else para
                if current and _estimate_tokens(f"{prefix}{candidate}") > max_tokens:
                    chunks.append((f"{prefix}{current}", page, dict(meta)))
                    current = para
                else:
                    current = candidate
            if current:
                chunks.append((f"{prefix}{current}", page, dict(meta)))

    return chunks

View File

@ -0,0 +1,147 @@
"""Table extraction utilities for Package 8.
Provides vision-based and text-based table detection and markdown conversion
for LegCo documents. Uses the existing LLM model (vision-capable) for
table-to-markdown conversion.
"""
from __future__ import annotations
import hashlib
import json
import logging
import os
from pathlib import Path
from typing import List, Optional
logger = logging.getLogger(__name__)
# Disk cache location for vision results: ".cache/vision_tables" under the
# directory two levels above the directory that contains this module.
_CACHE_DIR = Path(__file__).resolve().parent.parent.parent / ".cache" / "vision_tables"
async def extract_tables_vision(page_images: List[str], llm_client) -> List[str]:
    """Ask the vision-capable LLM to render each page image as Markdown.

    Args:
        page_images: Base64-encoded PNG strings, one per page.
        llm_client: Object exposing ``model`` and an underlying
            ``_client.chat.completions.create`` coroutine.

    Returns:
        The stripped, non-empty responses in page order. Pages whose request
        fails or comes back empty are omitted, so positions in the result do
        not necessarily line up with ``page_images``.
    """
    instructions = "\n".join([
        "Convert this page to Markdown. For any tables:",
        "- Use proper markdown table syntax with |---|---| alignment",
        "- Preserve all column headers and row labels",
        "- Do not modify or translate the content",
        "- If a table spans multiple pages, note it",
    ])

    markdown_pages: List[str] = []
    for page_index, encoded_png in enumerate(page_images):
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{encoded_png}"},
        }
        user_message = {
            "role": "user",
            "content": [{"type": "text", "text": instructions}, image_part],
        }
        try:
            completion = await llm_client._client.chat.completions.create(
                model=llm_client.model,
                messages=[user_message],
                temperature=0.1,
            )
            markdown = (completion.choices[0].message.content or "").strip()
            if markdown:
                markdown_pages.append(markdown)
        except Exception:
            # Best effort: one failing page must not abort the remaining pages.
            logger.warning("Vision table extraction failed for page image %d", page_index, exc_info=True)
    return markdown_pages
# Per-line heuristics for "this line looks like part of a table".
_TABLE_HEURISTIC_RE = [
    r"(?:\|[\s\-:]+\|)",  # markdown separator row, e.g. |---|:--|
    r"(?:\+[-=]+\+)",  # ASCII-art border, e.g. +----+====+
    # Three or more columns separated by runs of 2+ spaces/tabs. This must
    # not require a trailing "\n": lines are tested after splitting on "\n".
    r"(?:(?:\S+[ \t]{2,}){2,}\S+)",
]

_TABLE_REGION_PROMPT = (
    "Convert this raw table text extracted from a PDF into a markdown table.\n"
    "Preserve all data exactly. Detect column boundaries and alignment.\n\n"
    "{table_text}"
)


async def extract_tables_text(text: str, llm_client) -> List[str]:
    """Detect table-like text regions, send to LLM for markdown conversion.

    A region starts at a line matching any heuristic and continues over
    following non-blank lines; it ends at the first blank line. Regions
    shorter than three lines are discarded.

    Args:
        text: Raw text, e.g. extracted from a PDF.
        llm_client: Object exposing an async ``complete(prompt, ...)``.

    Returns:
        Stripped, non-empty LLM responses, one per region whose conversion
        succeeded. Empty list when no region is detected.
    """
    import re

    # One combined, precompiled pattern instead of re-scanning every line
    # with each heuristic separately.
    table_line_re = re.compile("|".join(_TABLE_HEURISTIC_RE))

    regions: List[str] = []
    current_region: List[str] = []
    in_table = False

    for line in text.split("\n"):
        if table_line_re.search(line):
            in_table = True
            current_region.append(line)
        elif in_table and line.strip():
            current_region.append(line)
        else:
            if len(current_region) >= 3:
                regions.append("\n".join(current_region))
            current_region = []
            in_table = False

    if len(current_region) >= 3:
        regions.append("\n".join(current_region))

    results: List[str] = []
    for region in regions:
        prompt = _TABLE_REGION_PROMPT.format(table_text=region)
        try:
            response = await llm_client.complete(prompt, temperature=0.1, step_name="TableExtraction")
            converted = (response or "").strip()
            if converted:
                results.append(converted)
        except Exception:
            # Best effort: a failed region must not abort the remaining ones.
            logger.warning("Text-based table extraction failed", exc_info=True)

    return results
def inject_tables_into_answer(answer: str, tables_md: List[str]) -> str:
    """Replace raw table text regions in answer with markdown tables.

    For each markdown table, its first non-blank line (stripped) is used as
    the anchor: every occurrence of that line in the answer is replaced by
    the whole table. Tables whose anchor does not occur in the answer, or
    that contain no non-blank line, leave the answer untouched.

    Args:
        answer: Answer text that may contain raw table header lines.
        tables_md: Markdown tables to splice in.

    Returns:
        The answer with matching anchors expanded to full tables.
    """
    if not tables_md:
        return answer

    result = answer
    for table_md in tables_md:
        # An empty anchor must never be used: "" is contained in every
        # string and str.replace("", x) inserts x between all characters.
        anchor = next(
            (line.strip() for line in table_md.split("\n") if line.strip()), ""
        )
        if anchor and anchor in result:
            result = result.replace(anchor, table_md)
    return result
def cache_vision_result(page_hash: str) -> Optional[str]:
    """Look up a cached vision result.

    Returns the UTF-8 contents of ``<page_hash>.md`` in the cache directory,
    or ``None`` when that file does not exist.
    """
    cached_path = _CACHE_DIR / f"{page_hash}.md"
    return cached_path.read_text(encoding="utf-8") if cached_path.exists() else None
def save_vision_result(page_hash: str, markdown: str) -> None:
    """Write ``markdown`` to ``<page_hash>.md`` in the cache directory.

    The cache directory is created first if it is missing.
    """
    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
    (_CACHE_DIR / f"{page_hash}.md").write_text(markdown, encoding="utf-8")
def compute_page_hash(page_image_b64: str) -> str:
    """Return the first 16 hex characters of the SHA-256 of the UTF-8 encoded input."""
    digest = hashlib.sha256(page_image_b64.encode("utf-8"))
    return digest.hexdigest()[:16]

View File

@ -56,9 +56,32 @@ export const ChunkList: React.FC<ChunkListProps> = ({
<span className="text-xs font-medium text-gray-500 uppercase"> <span className="text-xs font-medium text-gray-500 uppercase">
Chunk {chunk.chunk_index} Chunk {chunk.chunk_index}
</span> </span>
{chunk.strategy_type === 'question' && chunk.question_id ? (
<>
<span className="text-xs text-gray-600">
Q: {chunk.question_id}{chunk.question_text ? `${chunk.question_text.length > 80 ? `${chunk.question_text.slice(0, 80)}...` : chunk.question_text}` : ''}
</span>
{chunk.topic_section && (
<span className="text-xs text-gray-500">
Topic: {chunk.topic_section}
</span>
)}
{chunk.source_page_range && chunk.source_page_range.length === 2 && (
<span className="text-xs text-gray-400">
Pages {chunk.source_page_range[0]}-{chunk.source_page_range[1]}
</span>
)}
{chunk.has_table && (
<span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-amber-100 text-amber-700">
Contains table
</span>
)}
</>
) : (
<span className="text-xs text-gray-400"> <span className="text-xs text-gray-400">
Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'} Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
</span> </span>
)}
</div> </div>
<div className="text-sm text-gray-700 truncate" title={chunk.content_summary}> <div className="text-sm text-gray-700 truncate" title={chunk.content_summary}>
{chunk.content_summary.length > 100 {chunk.content_summary.length > 100
@ -67,7 +90,12 @@ export const ChunkList: React.FC<ChunkListProps> = ({
</div> </div>
{chunk.chunk_file_path && ( {chunk.chunk_file_path && (
<a <a
href={getPdfViewerUrl(chunk.chunk_file_path, chunk.page_number ?? undefined)} href={getPdfViewerUrl(
chunk.chunk_file_path,
chunk.strategy_type === 'question' && chunk.source_page_range && chunk.source_page_range.length > 0
? chunk.source_page_range[0]
: chunk.page_number ?? undefined
)}
target="_blank" target="_blank"
rel="noopener noreferrer" rel="noopener noreferrer"
className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline" className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline"

View File

@ -29,7 +29,18 @@ export const DocumentList: React.FC<DocumentListProps> = ({
<div className="flex items-center space-x-3 flex-1"> <div className="flex items-center space-x-3 flex-1">
<FileText className="w-5 h-5 text-gray-500 flex-shrink-0" /> <FileText className="w-5 h-5 text-gray-500 flex-shrink-0" />
<div className="flex-1 min-w-0"> <div className="flex-1 min-w-0">
<div className="font-medium text-gray-900 truncate">{doc.filename}</div> <div className="flex items-center space-x-2">
<span className="font-medium text-gray-900 truncate">{doc.filename}</span>
{doc.chunking_strategy === 'question' ? (
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-blue-100 text-blue-700">
chunked by question
</span>
) : (
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-gray-100 text-gray-700">
chunked by token
</span>
)}
</div>
<div className="text-sm text-gray-500"> <div className="text-sm text-gray-500">
{doc.chunk_count} chunks Uploaded {doc.upload_date} {doc.chunk_count} chunks Uploaded {doc.upload_date}
</div> </div>

View File

@ -1,5 +1,5 @@
import axios from 'axios' import axios from 'axios'
import type { QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types' import type { ChunkingStrategy, QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'
const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 'http://localhost:8000/api/v1' const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 'http://localhost:8000/api/v1'
@ -48,10 +48,10 @@ export const queryDocumentStream = async (
} }
} }
export const ingestDocument = async (file: File): Promise<IngestResponse> => { export const ingestDocument = async (file: File, strategy: ChunkingStrategy = 'token'): Promise<IngestResponse> => {
const form = new FormData() const form = new FormData()
form.append('file', file) form.append('file', file)
const resp = await apiClient.post<IngestResponse>('/ingest', form, { const resp = await apiClient.post<IngestResponse>(`/ingest?strategy=${strategy}`, form, {
headers: { 'Content-Type': 'multipart/form-data' }, headers: { 'Content-Type': 'multipart/form-data' },
}) })
return resp.data return resp.data

View File

@ -1,7 +1,7 @@
import React from 'react' import React from 'react'
import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query' import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api' import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api'
import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types' import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, ChunkingStrategy, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
import { useState, useCallback, useRef } from 'react' import { useState, useCallback, useRef } from 'react'
export const queryClient = new QueryClient() export const queryClient = new QueryClient()
@ -185,8 +185,8 @@ export const useQueryDocumentStream = () => {
} }
/**
 * Mutation hook that uploads one document for ingestion.
 *
 * Mutation variables:
 * - `file`: the document to upload.
 * - `strategy`: optional chunking strategy. When omitted, `undefined` is
 *   forwarded to `ingestDocument`, whose own parameter default (`'token'`)
 *   then applies, so the hook and the API helper share a single default.
 *
 * The mutation's data type is `IngestResponse` and its error type is `Error`.
 */
export const useIngestDocument = () => {
  return useMutation<IngestResponse, Error, { file: File; strategy?: ChunkingStrategy }>({
    mutationFn: ({ file, strategy }) => ingestDocument(file, strategy),
  })
}

View File

@ -1,10 +1,11 @@
import React, { useState, useCallback, useMemo } from 'react' import React, { useState, useCallback, useMemo } from 'react'
import { Database, AlertCircle, CheckCircle, XCircle, Loader2 } from 'lucide-react' import { Database, AlertCircle, CheckCircle, XCircle, Loader2, Type, MessageSquare } from 'lucide-react'
import { useQueryClient } from '@tanstack/react-query' import { useQueryClient } from '@tanstack/react-query'
import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries' import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries'
import { DocumentList } from '../components/DocumentList' import { DocumentList } from '../components/DocumentList'
import { ChunkList } from '../components/ChunkList' import { ChunkList } from '../components/ChunkList'
import { DocumentUpload } from '../components/DocumentUpload' import { DocumentUpload } from '../components/DocumentUpload'
import type { ChunkingStrategy } from '../types'
interface FileUploadEntry { interface FileUploadEntry {
name: string name: string
@ -22,6 +23,7 @@ export const RAGDatabasePage: React.FC = () => {
const initialDocId = useMemo(() => getDocumentIdFromUrl(), []) const initialDocId = useMemo(() => getDocumentIdFromUrl(), [])
const [expandedId, setExpandedId] = useState<string | null>(initialDocId) const [expandedId, setExpandedId] = useState<string | null>(initialDocId)
const [uploadEntries, setUploadEntries] = useState<FileUploadEntry[]>([]) const [uploadEntries, setUploadEntries] = useState<FileUploadEntry[]>([])
const [chunkingStrategy, setChunkingStrategy] = useState<ChunkingStrategy>('token')
const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments() const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments()
const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId) const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId)
@ -60,7 +62,7 @@ export const RAGDatabasePage: React.FC = () => {
const results = await Promise.allSettled( const results = await Promise.allSettled(
files.map(async (file) => { files.map(async (file) => {
try { try {
await ingestDocumentMutation.mutateAsync(file) await ingestDocumentMutation.mutateAsync({ file, strategy: chunkingStrategy })
setUploadEntries((prev) => setUploadEntries((prev) =>
prev.map((e) => prev.map((e) =>
e.name === file.name ? { ...e, status: 'success' as const } : e e.name === file.name ? { ...e, status: 'success' as const } : e
@ -80,7 +82,7 @@ export const RAGDatabasePage: React.FC = () => {
queryClient.invalidateQueries({ queryKey: ['documents'] }) queryClient.invalidateQueries({ queryKey: ['documents'] })
setTimeout(() => setUploadEntries([]), 5000) setTimeout(() => setUploadEntries([]), 5000)
}, [ingestDocumentMutation, queryClient]) }, [ingestDocumentMutation, queryClient, chunkingStrategy])
const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length
const successCount = uploadEntries.filter((e) => e.status === 'success').length const successCount = uploadEntries.filter((e) => e.status === 'success').length
@ -127,6 +129,42 @@ export const RAGDatabasePage: React.FC = () => {
/> />
</div> </div>
<div className="mt-3 flex items-center space-x-4">
<span className="text-sm font-medium text-gray-700">Chunking strategy:</span>
<div className="flex items-center space-x-3">
<label className="flex items-center space-x-2 cursor-pointer">
<input
type="radio"
name="chunking-strategy"
value="token"
checked={chunkingStrategy === 'token'}
onChange={() => setChunkingStrategy('token')}
className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
/>
<Type className="w-4 h-4 text-gray-500" />
<div>
<span className="text-sm font-medium text-gray-900">Chunk by Token</span>
<span className="text-xs text-gray-500 ml-1">Split by token windows with overlap</span>
</div>
</label>
<label className="flex items-center space-x-2 cursor-pointer">
<input
type="radio"
name="chunking-strategy"
value="question"
checked={chunkingStrategy === 'question'}
onChange={() => setChunkingStrategy('question')}
className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
/>
<MessageSquare className="w-4 h-4 text-gray-500" />
<div>
<span className="text-sm font-medium text-gray-900">Chunk by Question</span>
<span className="text-xs text-gray-500 ml-1">Detect Q&A pairs, extract tables</span>
</div>
</label>
</div>
</div>
{hasEntries && ( {hasEntries && (
<div className="mt-4 space-y-2"> <div className="mt-4 space-y-2">
<div className="text-sm font-medium text-gray-600"> <div className="text-sm font-medium text-gray-600">

View File

@ -1,3 +1,5 @@
/**
 * How a document is split into chunks when it is ingested.
 * - 'token': split by token windows with overlap.
 * - 'question': detect Q&A pairs and extract tables.
 */
export type ChunkingStrategy = 'token' | 'question'
export interface SourceMetadata { export interface SourceMetadata {
filename: string filename: string
upload_date: string upload_date: string
@ -40,6 +42,7 @@ export interface IngestResponse {
document_id: string document_id: string
chunk_count: number chunk_count: number
filename: string filename: string
strategy: ChunkingStrategy
} }
export interface DocumentInfo { export interface DocumentInfo {
@ -47,6 +50,7 @@ export interface DocumentInfo {
filename: string filename: string
chunk_count: number chunk_count: number
upload_date: string upload_date: string
chunking_strategy: ChunkingStrategy
} }
export interface ChunkInfo { export interface ChunkInfo {
@ -55,6 +59,13 @@ export interface ChunkInfo {
content_summary: string content_summary: string
page_number: number | null page_number: number | null
chunk_file_path: string | null chunk_file_path: string | null
strategy_type: ChunkingStrategy
question_index: number | null
question_id: string | null
question_text: string | null
topic_section: string | null
source_page_range: number[] | null
has_table: boolean | null
} }
export interface DocumentListResponse { export interface DocumentListResponse {