Merge branch 'RAG-workflow'
This commit is contained in:
commit
f637ab10a5
|
|
@ -327,7 +327,7 @@ For each section in the JSON response:
|
|||
If `qa_vision_enabled=false` or for cost optimization, use regex as a fast pre-filter. The LLM call is skipped for documents where regex confidently identifies all boundaries, and used only when:
|
||||
- No regex pattern matches (unknown format)
|
||||
- Regex produces < 2 sections (likely misdetection)
|
||||
- `qa_verification_model` is not set to `"none"`
|
||||
- `qa_structure_model` is not set to `"none"`
|
||||
|
||||
### Algorithm Detail: Table-to-Markdown
|
||||
|
||||
|
|
@ -382,7 +382,7 @@ class Settings(BaseSettings):
|
|||
# NEW: Q&A chunking config
|
||||
qa_vision_enabled: bool = True # Toggle vision-based table extraction (uses existing LLM_MODEL_NAME)
|
||||
qa_max_chunk_tokens: int = 3000 # Max tokens per Q&A chunk (before forced split)
|
||||
qa_verification_model: str = "" # LLM for boundary verification (empty = use LLM_MODEL_NAME)
|
||||
qa_structure_model: str = "" # LLM for structure detection (empty = use LLM_MODEL_NAME)
|
||||
qa_include_internal_refs: bool = True # Include [內部參考] in chunks
|
||||
qa_cache_vision_results: bool = True # Cache vision results per page
|
||||
|
||||
|
|
@ -390,7 +390,7 @@ class Settings(BaseSettings):
|
|||
# DEFAULT_CHUNKING_STRATEGY=token
|
||||
# QA_VISION_ENABLED=true
|
||||
# QA_MAX_CHUNK_TOKENS=3000
|
||||
# QA_VERIFICATION_MODEL=
|
||||
# QA_STRUCTURE_MODEL=
|
||||
# QA_INCLUDE_INTERNAL_REFS=true
|
||||
# QA_CACHE_VISION_RESULTS=true
|
||||
|
||||
|
|
|
|||
|
|
@ -41,3 +41,11 @@ MAX_VIDEO_SIZE_MB=300
|
|||
# Set to false to disable System Audio or Listen Mic capture
|
||||
SYSTEM_AUDIO_ENABLED=true
|
||||
MIC_ENABLED=true
|
||||
|
||||
# Q&A-pair chunking (Package 8)
|
||||
DEFAULT_CHUNKING_STRATEGY=token
|
||||
QA_VISION_ENABLED=true
|
||||
QA_MAX_CHUNK_TOKENS=3000
|
||||
QA_STRUCTURE_MODEL=
|
||||
QA_INCLUDE_INTERNAL_REFS=true
|
||||
QA_CACHE_VISION_RESULTS=true
|
||||
|
|
|
|||
|
|
@ -44,6 +44,14 @@ class Settings(BaseSettings):
|
|||
relevance_threshold: float = 7.0
|
||||
llm_timeout: float = 60.0
|
||||
|
||||
# Q&A-pair chunking strategy (Package 8)
|
||||
default_chunking_strategy: str = "token"
|
||||
qa_vision_enabled: bool = True
|
||||
qa_max_chunk_tokens: int = 3000
|
||||
qa_structure_model: str = ""
|
||||
qa_include_internal_refs: bool = True
|
||||
qa_cache_vision_results: bool = True
|
||||
|
||||
# Alibaba Cloud DashScope ASR (Phase 2)
|
||||
dashscope_api_key: str = ""
|
||||
asr_model_name: str = "qwen3-asr-flash"
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ class DocumentInfo(BaseModel):
|
|||
filename: str
|
||||
chunk_count: int
|
||||
upload_date: str
|
||||
chunking_strategy: str = "token"
|
||||
|
||||
|
||||
class ChunkInfo(BaseModel):
|
||||
|
|
@ -16,6 +17,14 @@ class ChunkInfo(BaseModel):
|
|||
content_summary: str
|
||||
page_number: Optional[int] = None
|
||||
chunk_file_path: Optional[str] = None
|
||||
strategy_type: Optional[str] = None
|
||||
question_index: Optional[int] = None
|
||||
question_id: Optional[str] = None
|
||||
question_text: Optional[str] = None
|
||||
section_heading: Optional[str] = None
|
||||
answer_contains_table: Optional[bool] = None
|
||||
source_page_range: Optional[List[int]] = None
|
||||
parent_topic: Optional[str] = None
|
||||
|
||||
|
||||
class DocumentListResponse(BaseModel):
|
||||
|
|
|
|||
|
|
@ -1,7 +1,18 @@
|
|||
from typing import Literal, get_args
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Closed set of chunking strategy names, expressed as a type for the models.
ChunkingStrategyType = Literal["token", "question"]

# Runtime view of the same set, derived from the Literal so the type
# annotation and the membership check can never drift apart.
VALID_CHUNKING_STRATEGIES = frozenset(get_args(ChunkingStrategyType))
|
||||
|
||||
|
||||
# Request model carrying the chunking strategy to apply to an ingest.
class IngestRequest(BaseModel):
    # One of the ChunkingStrategyType values; "token" when omitted.
    strategy: ChunkingStrategyType = "token"
|
||||
|
||||
|
||||
# Response model describing the result of an ingest.
class IngestResponse(BaseModel):
    document_id: str  # identifier of the ingested document
    chunk_count: int  # count of chunks reported for the document
    filename: str  # name of the ingested file
    # One of the ChunkingStrategyType values; "token" when not supplied.
    strategy: ChunkingStrategyType = "token"
|
||||
|
|
|
|||
|
|
@ -5,9 +5,9 @@ import tempfile
|
|||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Query
|
||||
|
||||
from app.models.ingest import IngestResponse
|
||||
from app.models.ingest import IngestResponse, VALID_CHUNKING_STRATEGIES
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(tags=["ingest"])
|
||||
|
|
@ -37,11 +37,14 @@ def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None:
|
|||
|
||||
|
||||
@router.post("/ingest", response_model=IngestResponse)
|
||||
async def ingest_document(file: UploadFile = File(...)):
|
||||
async def ingest_document(
|
||||
file: UploadFile = File(...),
|
||||
strategy: str = Query("token"),
|
||||
):
|
||||
"""Ingest a document into the RAG system."""
|
||||
from app.core.config import get_settings
|
||||
from app.services.rag import RAGService
|
||||
from app.utils.chunking import TokenChunkingStrategy
|
||||
from app.utils.chunking import get_chunking_strategy
|
||||
from app.utils.metadata import extract_metadata
|
||||
|
||||
filename = file.filename or "unknown"
|
||||
|
|
@ -53,6 +56,12 @@ async def ingest_document(file: UploadFile = File(...)):
|
|||
detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}",
|
||||
)
|
||||
|
||||
if strategy not in VALID_CHUNKING_STRATEGIES:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Invalid chunking strategy: {strategy}. Valid: {', '.join(sorted(VALID_CHUNKING_STRATEGIES))}",
|
||||
)
|
||||
|
||||
settings = get_settings()
|
||||
temp_path = None
|
||||
try:
|
||||
|
|
@ -68,9 +77,7 @@ async def ingest_document(file: UploadFile = File(...)):
|
|||
_delete_existing_document(rag, filename, chunk_dir)
|
||||
|
||||
document_id = str(uuid.uuid4())
|
||||
chunker = TokenChunkingStrategy(
|
||||
chunk_size=settings.chunk_size, overlap=settings.chunk_overlap
|
||||
)
|
||||
chunker = get_chunking_strategy(strategy, settings)
|
||||
|
||||
if file_ext == ".pdf":
|
||||
from app.utils.pdf_parser import parse_pdf_by_page
|
||||
|
|
@ -105,6 +112,8 @@ async def ingest_document(file: UploadFile = File(...)):
|
|||
)
|
||||
chunk_file_paths.append(None)
|
||||
|
||||
chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
|
||||
|
||||
metadata = extract_metadata(
|
||||
temp_path,
|
||||
chunk_texts,
|
||||
|
|
@ -112,6 +121,8 @@ async def ingest_document(file: UploadFile = File(...)):
|
|||
page_numbers=page_numbers,
|
||||
chunk_file_paths=chunk_file_paths,
|
||||
document_id=document_id,
|
||||
strategy_type=strategy,
|
||||
chunk_metadata=chunk_metadata,
|
||||
)
|
||||
|
||||
rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id)
|
||||
|
|
@ -145,9 +156,12 @@ async def ingest_document(file: UploadFile = File(...)):
|
|||
)
|
||||
chunk_file_paths.append(None)
|
||||
|
||||
chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
|
||||
|
||||
metadata = extract_metadata(
|
||||
temp_path, chunks, original_filename=filename,
|
||||
chunk_file_paths=chunk_file_paths, document_id=document_id,
|
||||
strategy_type=strategy, chunk_metadata=chunk_metadata,
|
||||
)
|
||||
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
|
||||
|
||||
|
|
@ -180,9 +194,12 @@ async def ingest_document(file: UploadFile = File(...)):
|
|||
)
|
||||
chunk_file_paths.append(None)
|
||||
|
||||
chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
|
||||
|
||||
metadata = extract_metadata(
|
||||
temp_path, chunks, original_filename=filename,
|
||||
chunk_file_paths=chunk_file_paths, document_id=document_id,
|
||||
strategy_type=strategy, chunk_metadata=chunk_metadata,
|
||||
)
|
||||
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
|
||||
|
||||
|
|
@ -193,6 +210,7 @@ async def ingest_document(file: UploadFile = File(...)):
|
|||
document_id=document_id,
|
||||
chunk_count=chunk_count,
|
||||
filename=filename,
|
||||
strategy=strategy,
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,60 @@
|
|||
"""Acceptance tests: Phase 8 Q&A-pair chunking with real LTT PDFs.
|
||||
|
||||
Prerequisites:
|
||||
- ChromaDB running (local)
|
||||
- .env configured with valid LLM_BASE_URL, LLM_API_KEY, LLM_MODEL_NAME
|
||||
- Test PDFs available in ../../test materials/LTT/
|
||||
|
||||
These tests require real LLM calls and actual LegCo PDFs.
|
||||
Run manually: pytest app/test/acceptance/test_acceptance_phase8_qa_chunking.py -v -m acceptance
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.acceptance
@pytest.mark.slow
class TestRealQaChunking:
    """End-to-end Q&A chunking with real LegCo PDFs from test materials/LTT/.

    Every test in this class is an unconditionally skipped placeholder whose
    body is ``pass``; each docstring records the expectation the test is
    meant to verify once it is implemented.
    """

    # Placeholder: no assertions yet.
    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
    def test_real_qa_chunking_fileE(self):
        """File E produces 12 Chinese Q&A pairs + 3 Others + narrative sections."""
        pass

    # Placeholder: no assertions yet.
    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
    def test_real_qa_chunking_fileL(self):
        """File L produces 24 English Q&A pairs + narrative sections."""
        pass

    # Placeholder: no assertions yet.
    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
    def test_real_qa_chunking_fileB(self):
        """File B produces 3 Chinese Q&A pairs + narrative sections."""
        pass

    # Placeholder: no assertions yet.
    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
    def test_real_qa_chunking_fileA(self):
        """File A falls back to narrative chunking (no Q&A, should not error)."""
        pass

    # Placeholder: no assertions yet.
    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
    def test_table_extraction_fileE(self):
        """Tables in File E answers converted to markdown."""
        pass

    # Placeholder: no assertions yet.
    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
    def test_table_extraction_fileL(self):
        """Tables in File L answers converted to markdown."""
        pass

    # Placeholder: no assertions yet.
    @pytest.mark.skip(reason="Requires real LLM API and test PDFs in test materials/LTT/")
    def test_qa_page_references(self):
        """Each Q&A chunk's page number points to question (問) location."""
        pass

    # Placeholder: no assertions yet.
    @pytest.mark.skip(reason="Requires full pipeline with LLM, embeddings, ChromaDB")
    def test_full_pipeline_question_strategy(self):
        """Full ingest -> retrieve -> query pipeline with Q&A chunks."""
        pass
|
||||
|
|
@ -31,3 +31,47 @@ def test_config_default_values(monkeypatch):
|
|||
settings = Settings()
|
||||
assert settings.llm_base_url == "https://openrouter.ai/api/v1"
|
||||
assert settings.llm_model_name == "qwen/qwen3.5-35b-a3b"
|
||||
|
||||
|
||||
def test_qa_chunking_config_defaults(monkeypatch):
    """Phase 8.0: Q&A chunking settings report their defaults once the
    corresponding environment variables have been removed."""
    qa_env_names = (
        "DEFAULT_CHUNKING_STRATEGY",
        "QA_VISION_ENABLED",
        "QA_MAX_CHUNK_TOKENS",
        "QA_STRUCTURE_MODEL",
        "QA_INCLUDE_INTERNAL_REFS",
        "QA_CACHE_VISION_RESULTS",
    )
    for env_name in qa_env_names:
        # raising=False: the variable may legitimately be absent already.
        monkeypatch.delenv(env_name, raising=False)

    from app.core.config import Settings

    cfg = Settings()

    # String and integer defaults.
    assert cfg.default_chunking_strategy == "token"
    assert cfg.qa_max_chunk_tokens == 3000
    assert cfg.qa_structure_model == ""

    # Boolean toggles must be the real True singleton, not merely truthy.
    assert cfg.qa_vision_enabled is True
    assert cfg.qa_include_internal_refs is True
    assert cfg.qa_cache_vision_results is True
|
||||
|
||||
|
||||
def test_qa_chunking_config_from_env(tmp_path, monkeypatch):
    """Phase 8.0: Q&A chunking config fields load from .env.

    A ``.env`` file is written into ``tmp_path`` and the working directory
    is switched there before ``Settings`` is instantiated. The matching real
    environment variables are removed first: pydantic-settings ranks
    environment variables above dotenv values, so a variable exported on the
    host would otherwise override the file and make the result
    machine-dependent.
    """
    dotenv_values = {
        "DEFAULT_CHUNKING_STRATEGY": "question",
        "QA_VISION_ENABLED": "false",
        "QA_MAX_CHUNK_TOKENS": "5000",
        "QA_STRUCTURE_MODEL": "anthropic/claude-3-haiku",
        "QA_INCLUDE_INTERNAL_REFS": "false",
        "QA_CACHE_VISION_RESULTS": "false",
    }

    # Keep host-level exports from shadowing the dotenv file under test.
    for env_name in dotenv_values:
        monkeypatch.delenv(env_name, raising=False)

    env_file = tmp_path / ".env"
    env_file.write_text(
        "".join(f"{key}={value}\n" for key, value in dotenv_values.items())
    )

    # NOTE(review): relies on Settings reading ".env" relative to the
    # current working directory; confirm against app.core.config.
    monkeypatch.chdir(tmp_path)
    from app.core.config import Settings

    settings = Settings()
    assert settings.default_chunking_strategy == "question"
    assert settings.qa_vision_enabled is False
    assert settings.qa_max_chunk_tokens == 5000
    assert settings.qa_structure_model == "anthropic/claude-3-haiku"
    assert settings.qa_include_internal_refs is False
    assert settings.qa_cache_vision_results is False
|
||||
|
|
|
|||
|
|
@ -0,0 +1,209 @@
|
|||
"""Phase 8 tests: Ingest API integration with strategy selection (Sub-Phase 8.3).
|
||||
|
||||
Covers:
|
||||
- POST /api/v1/ingest?strategy=token — existing behavior unchanged
|
||||
- POST /api/v1/ingest?strategy=question — Q&A chunking applied
|
||||
- Invalid strategy values return 400
|
||||
- IngestResponse includes strategy field
|
||||
- TXT with Q&A format uses question strategy
|
||||
- Document without Q&A falls back gracefully
|
||||
"""
|
||||
import io
|
||||
import json
|
||||
from typing import List, Tuple
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
from fastapi import FastAPI
|
||||
from fastapi.testclient import TestClient
|
||||
from pypdf import PdfWriter
|
||||
|
||||
from app.routers.ingest import router
|
||||
|
||||
|
||||
class _DeterministicEmbedding:
|
||||
def name(self) -> str:
|
||||
return "test_deterministic"
|
||||
|
||||
def __call__(self, input):
|
||||
return self._embed(input)
|
||||
|
||||
def embed_query(self, input):
|
||||
return self._embed(input)
|
||||
|
||||
@staticmethod
|
||||
def _embed(texts):
|
||||
vectors = []
|
||||
for text in texts:
|
||||
vec = [0.0] * 384
|
||||
for i, ch in enumerate(text[:384]):
|
||||
vec[i] = ord(ch) / 1000.0
|
||||
vectors.append(vec)
|
||||
return vectors
|
||||
|
||||
|
||||
def _create_real_pdf(content: str) -> bytes:
    """Return the bytes of a PDF holding a single blank 200x200 page.

    ``content`` is accepted but not written into the document.
    """
    pdf = PdfWriter()
    pdf.add_blank_page(width=200, height=200)
    with io.BytesIO() as stream:
        pdf.write(stream)
        return stream.getvalue()
|
||||
|
||||
|
||||
def _create_text_txt(content: str) -> bytes:
|
||||
return content.encode("utf-8")
|
||||
|
||||
|
||||
@pytest.fixture
def client(tmp_path, monkeypatch):
    """TestClient with real ChromaDB isolated in tmp_path + deterministic embeddings.

    Points the storage-related environment variables at paths under
    ``tmp_path``, clears the cached settings, initializes the prompts and
    history SQLite databases, swaps the embedding factory for
    ``_DeterministicEmbedding``, and yields a ``TestClient`` for an app that
    mounts the ingest router under ``/api/v1``. The settings caches are
    cleared again after the test.
    """
    # All on-disk state lives under the per-test temporary directory.
    chroma_path = str(tmp_path / "chroma_db")
    chunk_path = str(tmp_path / "document_chunk")
    prompts_path = str(tmp_path / "prompts.db")
    history_path = str(tmp_path / "history.db")

    monkeypatch.setenv("CHROMA_DB_PATH", chroma_path)
    monkeypatch.setenv("DOCUMENT_CHUNK_PATH", chunk_path)
    monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path)
    monkeypatch.setenv("HISTORY_DB_PATH", history_path)
    monkeypatch.setenv("EMBEDDING_MODEL", "test-mock")
    monkeypatch.setenv("LLM_API_KEY", "test-key")

    # Drop cached settings so the environment set above is picked up.
    from app.core.config import get_settings
    get_settings.cache_clear()
    from app.core.dependencies import get_settings_cached
    get_settings_cached.cache_clear()

    # Create the prompts database and seed its default profiles.
    from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles
    conn = _get_db(prompts_path)
    init_prompts_db(conn)
    seed_default_profiles(conn)
    conn.close()

    # Create the history database.
    hconn = _get_db(history_path)
    init_history_db(hconn)
    hconn.close()

    # Replace the embedding factory with the deterministic stand-in.
    monkeypatch.setattr(
        "app.core.database.get_embedding_function_settings",
        lambda settings: _DeterministicEmbedding(),
    )

    test_app = FastAPI()
    test_app.include_router(router, prefix="/api/v1")

    yield TestClient(test_app)

    # Teardown: clear the caches again once the test has finished.
    get_settings_cached.cache_clear()
    get_settings.cache_clear()
|
||||
|
||||
|
||||
def test_ingest_with_strategy_token(client):
    """An explicit ``strategy=token`` upload succeeds and reports that strategy with at least one chunk."""
    upload = {
        "file": (
            "test.txt",
            _create_text_txt("This is a test document with enough content to generate chunks."),
            "text/plain",
        )
    }
    response = client.post("/api/v1/ingest?strategy=token", files=upload)

    assert response.status_code == 200
    body = response.json()
    assert body["strategy"] == "token"
    assert body["chunk_count"] > 0
|
||||
|
||||
|
||||
def test_ingest_invalid_strategy_rejected(client):
    """An unknown strategy value yields HTTP 400 whose detail mentions "strategy"."""
    upload = {"file": ("test.txt", _create_text_txt("test"), "text/plain")}
    response = client.post("/api/v1/ingest?strategy=invalid", files=upload)

    assert response.status_code == 400
    detail = response.json()["detail"]
    assert "strategy" in detail.lower()
|
||||
|
||||
|
||||
def test_ingest_response_includes_strategy(client):
    """A successful ingest response carries a ``strategy`` key."""
    payload = _create_text_txt("Strategy response test content with more text to ensure chunks.")
    response = client.post(
        "/api/v1/ingest?strategy=token",
        files={"file": ("test.txt", payload, "text/plain")},
    )

    assert response.status_code == 200
    body = response.json()
    assert "strategy" in body
|
||||
|
||||
|
||||
def test_ingest_default_strategy_is_token(client):
    """Omitting the ``strategy`` query parameter results in the token strategy."""
    payload = _create_text_txt("Default strategy test with enough text to generate output.")
    upload = {"file": ("test.txt", payload, "text/plain")}

    response = client.post("/api/v1/ingest", files=upload)

    assert response.status_code == 200
    reported = response.json()["strategy"]
    assert reported == "token"
|
||||
|
||||
|
||||
def test_ingest_question_strategy_txt(client, monkeypatch):
    """A TXT upload with Q&A markers ingests under ``strategy=question`` and yields chunks.

    The question chunker is replaced by the canned mock before the request.
    """
    _mock_question_chunker(monkeypatch)

    qa_text = "問A1:test question\n答A1:test answer with more text here to ensure chunking works properly."
    upload = {"file": ("test.txt", _create_text_txt(qa_text), "text/plain")}

    response = client.post("/api/v1/ingest?strategy=question", files=upload)

    assert response.status_code == 200
    body = response.json()
    assert body["strategy"] == "question"
    assert body["chunk_count"] > 0
|
||||
|
||||
|
||||
def test_ingest_question_strategy_no_qa_fallback(client, monkeypatch):
    """Plain text without Q&A markers still ingests successfully under ``strategy=question``.

    The question chunker is replaced by the canned mock before the request.
    """
    _mock_question_chunker(monkeypatch)

    plain_text = "This is plain text without any Q&A markers, but it needs to be long enough to generate at least one chunk when processed by the tokenizer."
    upload = {"file": ("plain.txt", _create_text_txt(plain_text), "text/plain")}

    response = client.post("/api/v1/ingest?strategy=question", files=upload)

    assert response.status_code == 200
    body = response.json()
    assert body["strategy"] == "question"
    assert body["chunk_count"] > 0
|
||||
|
||||
|
||||
def _mock_question_chunker(monkeypatch):
|
||||
"""Replace QuestionChunkingStrategy with a mock that returns test chunks."""
|
||||
|
||||
class _MockQuestionChunker:
|
||||
def __init__(self, settings=None, llm_client=None):
|
||||
self._chunk_metadata = [
|
||||
{
|
||||
"strategy_type": "question",
|
||||
"section_type": "qa",
|
||||
"question_index": 0,
|
||||
"question_id": "A1",
|
||||
"question_text": "What is X?",
|
||||
"section_heading": "(A) Topic",
|
||||
"answer_contains_table": False,
|
||||
"source_page_range": [1, 2],
|
||||
}
|
||||
]
|
||||
self._max_tokens = 3000
|
||||
|
||||
def chunk(self, text):
|
||||
self._chunk_metadata = self._chunk_metadata[:1]
|
||||
return ["Question: What is X?\n\nAnswer: X is Y."]
|
||||
|
||||
def chunk_pages(self, pages, overlap_tokens=0):
|
||||
self._chunk_metadata = self._chunk_metadata[:1]
|
||||
return [("Question: What is X?\n\nAnswer: X is Y.", 1)]
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.utils.chunking.QuestionChunkingStrategy",
|
||||
_MockQuestionChunker,
|
||||
)
|
||||
|
|
@ -0,0 +1,149 @@
|
|||
"""Phase 8 tests: Q&A metadata enrichment (Sub-Phase 8.2).
|
||||
|
||||
Covers:
|
||||
- Metadata enrichment with Q&A-specific fields via chunk_metadata param
|
||||
- Backward compatibility: token strategy unchanged
|
||||
- Page number references question location
|
||||
- Chunk metadata merging with base metadata
|
||||
"""
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from app.utils.metadata import extract_metadata
|
||||
|
||||
|
||||
def test_qa_metadata_fields(tmp_path):
    """Per-chunk Q&A fields supplied through ``chunk_metadata`` appear on the matching metadata entries."""
    pdf_stub = tmp_path / "test.pdf"
    pdf_stub.write_text("dummy content")

    first_extra = {
        "strategy_type": "question",
        "section_type": "qa",
        "question_index": 0,
        "question_id": "A1",
        "question_text": "What is X?",
        "section_heading": "(A) Section",
        "answer_contains_table": True,
        "source_page_range": [2, 5],
        "parent_topic": "Topic Name",
    }
    # The second entry deliberately has no "parent_topic".
    second_extra = {
        "strategy_type": "question",
        "section_type": "qa",
        "question_index": 1,
        "question_id": "A2",
        "question_text": "What is Y?",
        "section_heading": "(A) Section",
        "answer_contains_table": False,
        "source_page_range": [5, 7],
    }

    entries = extract_metadata(
        file_path=str(pdf_stub),
        chunks=["chunk 1", "chunk 2"],
        strategy_type="question",
        chunk_metadata=[first_extra, second_extra],
    )
    assert len(entries) == 2
    first, second = entries

    assert first["strategy_type"] == "question"
    assert first["section_type"] == "qa"
    assert first["question_index"] == 0
    assert first["question_id"] == "A1"
    assert first["question_text"] == "What is X?"
    assert first["section_heading"] == "(A) Section"
    assert first["answer_contains_table"] is True
    assert first["source_page_range"] == [2, 5]
    assert first["parent_topic"] == "Topic Name"

    assert second["question_index"] == 1
    assert second["question_id"] == "A2"
    assert second["answer_contains_table"] is False
|
||||
|
||||
|
||||
def test_qa_metadata_topic_section(tmp_path):
    """Both ``section_heading`` and ``parent_topic`` survive the merge into the metadata entry."""
    pdf_stub = tmp_path / "test.pdf"
    pdf_stub.write_text("dummy content")

    extra = {"section_heading": "(B) Traffic", "parent_topic": "Traffic Planning"}
    entries = extract_metadata(
        file_path=str(pdf_stub),
        chunks=["chunk"],
        strategy_type="question",
        chunk_metadata=[extra],
    )

    only_entry = entries[0]
    assert only_entry["section_heading"] == "(B) Traffic"
    assert entries[0]["parent_topic"] == "Traffic Planning"
|
||||
|
||||
|
||||
def test_token_metadata_unchanged(tmp_path):
    """Without ``chunk_metadata`` the token strategy keeps the base fields and adds no Q&A keys."""
    txt_stub = tmp_path / "test.txt"
    txt_stub.write_text("test content")

    entries = extract_metadata(
        file_path=str(txt_stub),
        chunks=["chunk 1", "chunk 2"],
        original_filename="original.txt",
        strategy_type="token",
    )
    assert len(entries) == 2

    base_keys = ("filename", "upload_date", "content_summary", "chunk_index")
    for entry in entries:
        for key in base_keys:
            assert key in entry
        # strategy_type may be absent; when present it must be "token".
        assert entry.get("strategy_type", "token") == "token"
        assert "question_id" not in entry
|
||||
|
||||
|
||||
def test_page_number_from_question(tmp_path):
    """``page_number`` comes from ``page_numbers`` while ``source_page_range`` is carried from ``chunk_metadata``."""
    pdf_stub = tmp_path / "test.pdf"
    pdf_stub.write_text("dummy content")

    extra = {"question_id": "A1", "source_page_range": [3, 8]}
    entries = extract_metadata(
        file_path=str(pdf_stub),
        chunks=["question chunk"],
        page_numbers=[3],
        strategy_type="question",
        chunk_metadata=[extra],
    )

    assert entries[0]["page_number"] == 3
    assert entries[0]["source_page_range"] == [3, 8]
|
||||
|
||||
|
||||
def test_chunk_metadata_length_mismatch(tmp_path):
    """Two metadata dicts for three chunks raise ``ValueError`` mentioning "chunk_metadata length"."""
    pdf_stub = tmp_path / "test.pdf"
    pdf_stub.write_text("dummy content")

    three_chunks = ["a", "b", "c"]
    two_extras = [{}, {}]
    with pytest.raises(ValueError, match="chunk_metadata length"):
        extract_metadata(
            file_path=str(pdf_stub),
            chunks=three_chunks,
            chunk_metadata=two_extras,
        )
|
||||
|
||||
|
||||
def test_chunk_metadata_empty_no_error(tmp_path):
    """An empty ``chunk_metadata`` list is accepted for a non-empty ``chunks`` list.

    One metadata entry is still produced for the single chunk, even though
    the two list lengths differ.
    """
    pdf_stub = tmp_path / "test.pdf"
    pdf_stub.write_text("dummy content")

    entries = extract_metadata(
        file_path=str(pdf_stub),
        chunks=["a"],
        chunk_metadata=[],
    )

    assert len(entries) == 1
|
||||
|
|
@ -0,0 +1,481 @@
|
|||
"""Phase 8 tests: Q&A-pair chunking strategy (Sub-Phase 8.1).
|
||||
|
||||
Covers:
|
||||
- LLM structure detection response parsing (parse_llm_structure_response)
|
||||
- Mixed format handling (問/答 + section headings)
|
||||
- Narrative-only text (no Q&A format)
|
||||
- Speaking notes (發言要點) chunking by bullet
|
||||
- Regex fast-pass for Chinese 問/答 format
|
||||
- Regex fast-pass for English Q1/Q2 format
|
||||
- Multi-page section tracking with [PAGE_BREAK] markers
|
||||
- ChunkingStrategy ABC compliance
|
||||
- Page number references question (問) page, not answer
|
||||
- Size limit: oversized sections recursively split with heading preserved
|
||||
- build_chunks_from_sections output verification
|
||||
- preprocess_text: footer stripping, colon normalization, page break insertion
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import List, Tuple
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from app.utils.qa_chunking import (
|
||||
Section,
|
||||
preprocess_text,
|
||||
build_structure_detection_prompt,
|
||||
parse_llm_structure_response,
|
||||
split_chinese_qa,
|
||||
split_english_qa,
|
||||
build_chunks_from_sections,
|
||||
)
|
||||
from app.utils.chunking import (
|
||||
ChunkingStrategy,
|
||||
QuestionChunkingStrategy,
|
||||
get_chunking_strategy,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture
def mock_settings():
    """Return a ``MagicMock`` standing in for Settings, preset with Q&A chunking and LLM attributes."""
    # Keyword arguments to MagicMock are applied as attributes on the mock.
    return MagicMock(
        # Q&A chunking options
        default_chunking_strategy="question",
        qa_vision_enabled=False,
        qa_max_chunk_tokens=3000,
        qa_structure_model="",
        qa_include_internal_refs=True,
        qa_cache_vision_results=True,
        # Token chunking options
        chunk_size=1000,
        chunk_overlap=200,
        # LLM connection options
        llm_model_name="test-model",
        llm_api_key="test-key",
        llm_base_url="https://example.com/v1",
        llm_timeout=30.0,
        llm_enable_thinking=False,
        vllm_engine=False,
    )
|
||||
|
||||
|
||||
# Canned structure-detection reply shared by the parsing tests: one "qa",
# one "narrative" and one "speaking_notes" section, serialized to JSON text.
SAMPLE_LLM_RESPONSE = json.dumps(
    dict(
        sections=[
            dict(
                type="qa",
                heading="(A) 排水系統",
                qa_id="A1",
                question="古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?",
                answer="研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。",
                start_page=2,
                end_page=3,
                has_table=False,
                parent_topic="排水系統",
            ),
            dict(
                type="narrative",
                heading="(1) 住戶的安置補償",
                content="合資格住戶可選擇(i)「須通過經濟狀況審查」安置安排。",
                start_page=2,
                end_page=5,
                has_table=False,
            ),
            dict(
                type="speaking_notes",
                heading="發言要點",
                content="⚫ 古洞北/粉嶺北新發展區是北部都會區內的新市鎮發展項目\n⚫ 第一期發展預計於2030年完成",
                start_page=1,
                end_page=2,
                has_table=False,
            ),
        ]
    )
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: LLM structure detection parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLLMStructureDetection:
|
||||
|
||||
def test_llm_structure_detection(self):
|
||||
"""parse_llm_structure_response correctly parses LLM JSON with qa/narrative/speaking_notes."""
|
||||
sections = parse_llm_structure_response(SAMPLE_LLM_RESPONSE)
|
||||
assert len(sections) == 3
|
||||
|
||||
qa = sections[0]
|
||||
assert qa.type == "qa"
|
||||
assert qa.qa_id == "A1"
|
||||
assert qa.question == "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?"
|
||||
assert qa.answer == "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。"
|
||||
assert qa.start_page == 2
|
||||
assert qa.end_page == 3
|
||||
assert qa.heading == "(A) 排水系統"
|
||||
assert qa.parent_topic == "排水系統"
|
||||
|
||||
narr = sections[1]
|
||||
assert narr.type == "narrative"
|
||||
assert narr.heading == "(1) 住戶的安置補償"
|
||||
assert "合資格住戶" in narr.content
|
||||
|
||||
notes = sections[2]
|
||||
assert notes.type == "speaking_notes"
|
||||
assert "⚫" in notes.content
|
||||
|
||||
def test_llm_handles_mixed_formats(self):
|
||||
"""Document with 問/答 markers + section headings correctly classified."""
|
||||
mixed_json = json.dumps({
|
||||
"sections": [
|
||||
{
|
||||
"type": "qa",
|
||||
"heading": "(B) 交通",
|
||||
"qa_id": "B1",
|
||||
"question": "新建道路何時通車?",
|
||||
"answer": "預計2027年通車。",
|
||||
"start_page": 3,
|
||||
"end_page": 4,
|
||||
"has_table": False,
|
||||
},
|
||||
{
|
||||
"type": "narrative",
|
||||
"heading": "背景",
|
||||
"content": "本文件說明交通規劃。",
|
||||
"start_page": 1,
|
||||
"end_page": 2,
|
||||
"has_table": False,
|
||||
},
|
||||
]
|
||||
})
|
||||
sections = parse_llm_structure_response(mixed_json)
|
||||
assert len(sections) == 2
|
||||
assert sections[0].type == "qa"
|
||||
assert sections[1].type == "narrative"
|
||||
|
||||
def test_llm_handles_no_qa_format(self):
|
||||
"""Narrative-only text (like File L pages 1-13) produces only narrative sections."""
|
||||
narrative_json = json.dumps({
|
||||
"sections": [
|
||||
{
|
||||
"type": "narrative",
|
||||
"heading": "Introduction",
|
||||
"content": "This document provides background on policy matters.",
|
||||
"start_page": 1,
|
||||
"end_page": 5,
|
||||
"has_table": False,
|
||||
},
|
||||
{
|
||||
"type": "narrative",
|
||||
"heading": "Analysis",
|
||||
"content": "The analysis covers multiple dimensions.",
|
||||
"start_page": 5,
|
||||
"end_page": 13,
|
||||
"has_table": False,
|
||||
},
|
||||
]
|
||||
})
|
||||
sections = parse_llm_structure_response(narrative_json)
|
||||
assert len(sections) == 2
|
||||
assert all(s.type == "narrative" for s in sections)
|
||||
|
||||
def test_llm_handles_speaking_notes(self):
|
||||
"""發言要點 text with bullet points produces speaking_notes sections."""
|
||||
notes_json = json.dumps({
|
||||
"sections": [
|
||||
{
|
||||
"type": "speaking_notes",
|
||||
"heading": "發言要點",
|
||||
"content": "⚫ 要點一:政策方向\n⚫ 要點二:實施計劃\n⚫ 要點三:預算安排",
|
||||
"start_page": 1,
|
||||
"end_page": 2,
|
||||
"has_table": False,
|
||||
},
|
||||
]
|
||||
})
|
||||
sections = parse_llm_structure_response(notes_json)
|
||||
assert len(sections) == 1
|
||||
assert sections[0].type == "speaking_notes"
|
||||
assert sections[0].content.count("⚫") == 3
|
||||
|
||||
def test_parse_markdown_fenced_json(self):
    """A response wrapped in a ```json ... ``` fence is unwrapped before parsing."""
    wrapped_response = f"```json\n{SAMPLE_LLM_RESPONSE}\n```"
    parsed = parse_llm_structure_response(wrapped_response)
    assert len(parsed) == 3
|
||||
|
||||
def test_parse_invalid_json_raises(self):
    """Non-JSON input makes parse_llm_structure_response raise ValueError."""
    not_json = "this is not json"
    with pytest.raises(ValueError) as excinfo:
        parse_llm_structure_response(not_json)
    # The message must identify the failure as a JSON problem.
    assert "Invalid JSON" in str(excinfo.value)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: Regex fast-pass
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRegexFastPass:
    """Regex-only detection of Q&A pairs; no LLM call is involved."""

    def test_regex_fastpass_chinese(self):
        """問/答 markers with matching ids are detected by split_chinese_qa."""
        lines = [
            "(A) 排水系統",
            "問 B1:古洞北的設計是否能抵禦氣候變化?",
            "答 B1:研究顧問已為古洞北新發展區進行了評估。",
            "問 B2:第二個問題是什麼?",
            "答 B2:這是第二個問題的答案。",
        ]
        document = "\n".join(lines) + "\n"

        found = split_chinese_qa(document)

        assert len(found) >= 2
        # Every detected section is a Q&A pair.
        assert not [s for s in found if s.type != "qa"]
        # The first question is the one mentioning 古洞北.
        assert "古洞北" in found[0].question

    def test_regex_fastpass_chinese_no_match(self):
        """Text without 問/答 markers yields an empty list."""
        plain = "This is plain text without any Q&A markers."
        result = split_chinese_qa(plain)
        assert result == []

    def test_regex_fastpass_english(self):
        """Q1/Q2 line markers are detected by split_english_qa."""
        lines = [
            "Background information here.",
            "",
            "Q1 What is the timeline for the project?",
            "The project is expected to complete by 2027.",
            "Q2 How much will it cost?",
            "The estimated cost is HK$500 million.",
        ]
        document = "\n".join(lines) + "\n"

        found = split_english_qa(document)

        assert len(found) >= 2
        assert not [s for s in found if s.type != "qa"]
        questions = [(s.question or "").lower() for s in found]
        assert any("timeline" in q for q in questions)

    def test_regex_fastpass_english_no_match(self):
        """Text without Q-number markers yields an empty list."""
        chinese_only = "純中文文本沒有英文問答標記。"
        result = split_english_qa(chinese_only)
        assert result == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: Multi-page tracking
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMultiPage:
    """Page-marker insertion for multi-page input."""

    def test_multi_page_sections(self):
        """preprocess_text emits one [PAGE_BREAK: N] marker per input page."""
        page_texts = {
            1: "Header line\n(A) Water drainage\nSome intro text",
            2: "More drainage info\nFooter text X-1",
            3: "New section begins\n(B) Traffic planning",
        }
        merged = preprocess_text(list(page_texts.items()))

        # Each page number must show up as its own marker.
        for page_number in (1, 2, 3):
            assert f"[PAGE_BREAK: {page_number}]" in merged
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: ABC contract
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestABCContract:
    """Strategy classes honour the ChunkingStrategy interface."""

    def test_abc_contract(self):
        """QuestionChunkingStrategy is a ChunkingStrategy."""
        settings_stub = MagicMock(
            qa_max_chunk_tokens=3000,
            qa_include_internal_refs=True,
        )
        instance = QuestionChunkingStrategy(settings=settings_stub)
        assert isinstance(instance, ChunkingStrategy)

    def test_get_chunking_strategy_factory(self, mock_settings):
        """The factory maps each strategy name to the expected class."""
        by_token = get_chunking_strategy("token", mock_settings)
        by_question = get_chunking_strategy("question", mock_settings)

        assert isinstance(by_token, ChunkingStrategy)
        assert isinstance(by_question, QuestionChunkingStrategy)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: Page number reference
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestPageNumberReference:
    """The page reference of a Q&A chunk is the page of its question."""

    def test_page_number_reference_question(self):
        """The chunk page is the question (問) page; the full range lives in metadata."""
        qa_pair = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question="What is X?",
            answer="X is Y.",
            start_page=5,
            end_page=8,
        )

        produced = build_chunks_from_sections([qa_pair])

        assert len(produced) == 1
        _text, page_ref, meta = produced[0]
        # start_page is where the question sits, so it is the reference page.
        assert page_ref == 5
        assert meta.get("source_page_range") == [5, 8]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: Size limit recursive split
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSizeLimit:
    """Oversized Q&A sections are split into several chunks."""

    def test_size_limit(self):
        """An oversized QA section is split and every part repeats the question."""
        question = "What is the detailed plan?"
        # 80 paragraphs of roughly 200 characters each make a very long answer.
        paragraphs = [f"Paragraph {i}: " + "x" * 200 for i in range(80)]
        oversized = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question=question,
            answer="\n\n".join(paragraphs),
            start_page=2,
            end_page=5,
            has_table=False,
        )

        # A small token budget forces the split.
        produced = build_chunks_from_sections([oversized], max_tokens=500)

        assert len(produced) > 1
        for part_text, part_page, _meta in produced:
            # The question text is repeated in every part.
            assert question in part_text
            # The reference page stays on the question page.
            assert part_page == 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: build_chunks_from_sections
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestBuildChunksFromSections:
    """End-to-end check of build_chunks_from_sections over mixed section types."""

    def test_build_chunks_from_sections(self):
        """Chunk texts and metadata are derived correctly from a mixed section list."""
        qa_section = Section(
            type="qa",
            heading="(A) 排水系統",
            qa_id="A1",
            question="古洞北的設計是否能抵禦氣候變化?",
            answer="研究顧問已為古洞北進行了評估。",
            start_page=2,
            end_page=3,
            has_table=True,
            parent_topic="排水系統",
        )
        narrative_section = Section(
            type="narrative",
            heading="(1) 住戶的安置補償",
            content="合資格住戶可選擇安置安排。",
            start_page=3,
            end_page=5,
            has_table=False,
        )
        notes_section = Section(
            type="speaking_notes",
            heading="發言要點",
            content="⚫ 要點一:政策方向\n⚫ 要點二:實施計劃",
            start_page=1,
            end_page=1,
            has_table=False,
        )
        toc_section = Section(
            type="toc",
            heading="目錄",
            content="Page 1 ... Page 2",
            start_page=1,
            end_page=1,
            has_table=False,
        )

        chunks = build_chunks_from_sections(
            [qa_section, narrative_section, notes_section, toc_section]
        )

        def of_type(kind):
            """Return the chunks whose metadata carries the given section_type."""
            return [c for c in chunks if c[2].get("section_type") == kind]

        # Expected: 1 QA + 1 narrative + 2 speaking_notes bullets + 0 toc = 4
        assert len(chunks) >= 4

        # The QA chunk comes first and carries the full Q&A metadata.
        first_text, first_page, first_meta = chunks[0]
        assert "古洞北" in first_text
        assert first_page == 2
        assert first_meta["section_type"] == "qa"
        assert first_meta["question_id"] == "A1"
        assert first_meta["question_index"] == 0
        assert first_meta["answer_contains_table"] is True
        assert first_meta["section_heading"] == "(A) 排水系統"

        # Exactly one narrative chunk, with heading and body text.
        narrative = of_type("narrative")
        assert len(narrative) == 1
        narrative_text = narrative[0][0]
        assert "住戶的安置補償" in narrative_text
        assert "合資格住戶" in narrative_text

        # One chunk per bullet for speaking notes.
        notes = of_type("speaking_notes")
        assert len(notes) == 2
        for note_text, _page, _meta in notes:
            assert "要點" in note_text

        # The table of contents never becomes a chunk.
        assert len(of_type("toc")) == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: preprocess_text
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestPreprocessText:
    """Behaviour of preprocess_text: markers, colon normalization, footer removal."""

    def test_preprocess_text(self):
        """Footer markers stripped, colons normalized, page breaks inserted."""
        # \uff1a is the fullwidth colon; written as an escape so that the test
        # cannot silently degrade to an ASCII colon through re-encoding.
        pages = [
            (1, "Header\n(A) Section Title\nX-1\n2024-01-15"),
            (2, "Content with\uff1afullwidth colon\nMore text\uff1ahere"),
        ]
        result = preprocess_text(pages)

        # Should have page break markers
        assert "[PAGE_BREAK: 1]" in result
        assert "[PAGE_BREAK: 2]" in result

        # Fullwidth colons normalized to ASCII. The replacement is checked in
        # context because the page markers themselves already contain ":".
        assert "\uff1a" not in result
        assert "Content with:fullwidth colon" in result
        assert "More text:here" in result

        # Page footer patterns should be stripped (X-1, dates like 2024-01-15)
        assert "X-1" not in result
        assert "2024-01-15" not in result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: build_structure_detection_prompt
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestBuildPrompt:
    """Content checks on the structure-detection prompt."""

    def test_build_structure_detection_prompt(self):
        """The prompt names the domain, the section types, and embeds the document."""
        document = "Sample document text [PAGE_BREAK: 1]"
        prompt = build_structure_detection_prompt(document)
        lowered = prompt.lower()

        assert "Hong Kong Legislative Council" in prompt or "Legislative Council" in prompt
        assert "qa" in lowered or "問" in prompt
        assert "narrative" in lowered
        assert "speaking_notes" in lowered or "speaking notes" in lowered
        # The document text is embedded verbatim.
        assert document in prompt
|
||||
|
|
@ -6,8 +6,15 @@ token-based windows.
|
|||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Tuple
|
||||
from typing import TYPE_CHECKING, List, Optional, Tuple
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from app.core.config import Settings
|
||||
from app.services.llm_client import LLMClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ChunkingStrategy(ABC):
|
||||
|
|
@ -117,3 +124,111 @@ class TokenChunkingStrategy(ChunkingStrategy):
|
|||
results.append(("\n".join(parts), page_num))
|
||||
|
||||
return results
|
||||
|
||||
|
||||
class QuestionChunkingStrategy(ChunkingStrategy):
    """Chunk text by detecting Q&A structure using LLM and/or regex patterns.

    Designed for LegCo documents with explicit 問/答 or Q1/Q2 markers.
    Falls back to section-based chunking for narrative-only documents.

    Detection order is: Chinese regex, English regex, then (in
    ``chunk_pages`` only, and only when an ``llm_client`` was supplied) LLM
    structure detection. If nothing is detected, the whole text becomes a
    single narrative section.

    The per-chunk metadata dicts of the most recent ``chunk`` /
    ``chunk_pages`` call are kept in ``self._chunk_metadata``, index-aligned
    with the returned chunks.
    """

    def __init__(
        self,
        settings: "Settings",
        llm_client: Optional["LLMClient"] = None,
    ):
        """Store collaborators and read the token budget from settings.

        Args:
            settings: Application settings; ``qa_max_chunk_tokens`` is read
                with a default of 3000 when the attribute is missing.
            llm_client: Optional async LLM client used for structure
                detection when both regex passes find nothing.
        """
        self._settings = settings
        self._llm_client = llm_client
        self._max_tokens = getattr(settings, "qa_max_chunk_tokens", 3000)
        self._chunk_metadata: List[dict] = []

    def chunk(self, text: str) -> List[str]:
        """Split text into chunks using Q&A detection (for DOCX/TXT).

        Only the regex passes are used here; there is no page information,
        so the narrative fallback is assigned page 1.

        Returns:
            The chunk texts, or an empty list for empty/whitespace input.
        """
        if not text or not text.strip():
            return []

        # Imported lazily, as in the page-based path below.
        from app.utils.qa_chunking import (
            split_chinese_qa,
            split_english_qa,
            build_chunks_from_sections,
            Section,
        )

        sections = split_chinese_qa(text)
        if not sections:
            sections = split_english_qa(text)

        if not sections:
            sections = [Section(type="narrative", content=text, start_page=1, end_page=1)]

        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [chunk_text for chunk_text, _, _ in results]

    def _detect_sections_with_llm(self, full_text: str) -> list:
        """Ask the LLM to classify sections; return [] on any failure.

        The LLM client is asynchronous while this strategy is synchronous, so
        the coroutine is driven on a private event loop. When the caller is
        already inside a running loop that is impossible without blocking the
        loop, so detection is skipped (and logged) instead.
        """
        import asyncio

        from app.utils.qa_chunking import (
            build_structure_detection_prompt,
            parse_llm_structure_response,
        )

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            pass  # No loop is running in this thread: safe to block below.
        else:
            logger.warning(
                "LLM structure detection skipped: called from a running event loop"
            )
            return []

        prompt = build_structure_detection_prompt(full_text)
        try:
            # A dedicated loop avoids the deprecated asyncio.get_event_loop(),
            # which raises in threads that have no current loop, and leaves
            # the thread's current-loop setting untouched.
            loop = asyncio.new_event_loop()
            try:
                response = loop.run_until_complete(
                    self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection")
                )
            finally:
                loop.close()
            return parse_llm_structure_response(response)
        except Exception:
            # Best effort by design: the caller falls back to a narrative section.
            logger.warning("LLM structure detection failed, using fallback", exc_info=True)
            return []

    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
    ) -> List[Tuple[str, int]]:
        """Split page-segmented text using Q&A detection (for PDF).

        Args:
            pages: ``(page_number, page_text)`` tuples in document order.
            overlap_tokens: Not used by this strategy.

        Returns list of (chunk_text, page_number) where page_number
        references the question location for Q&A chunks.
        """
        if not pages:
            return []

        from app.utils.qa_chunking import (
            preprocess_text,
            split_chinese_qa,
            split_english_qa,
            build_chunks_from_sections,
            Section,
        )

        full_text = preprocess_text(pages)

        sections = split_chinese_qa(full_text)
        if not sections:
            sections = split_english_qa(full_text)

        if not sections and self._llm_client is not None:
            sections = self._detect_sections_with_llm(full_text)

        if not sections:
            # Use the real page numbers rather than assuming 1..len(pages),
            # so partial documents keep correct references.
            sections = [
                Section(
                    type="narrative",
                    content=full_text,
                    start_page=pages[0][0],
                    end_page=pages[-1][0],
                )
            ]

        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [(chunk_text, page_num) for chunk_text, page_num, _ in results]
|
||||
|
||||
|
||||
def get_chunking_strategy(
    name: str,
    settings: "Settings",
    llm_client: Optional["LLMClient"] = None,
) -> ChunkingStrategy:
    """Factory: return the named chunking strategy.

    Args:
        name: "token" or "question". Any other value falls back to the
            token strategy.
        settings: Application settings instance.
        llm_client: Optional LLM client forwarded to the question strategy,
            which uses it for structure detection when the regex passes find
            nothing. Ignored by the token strategy. Defaults to None, which
            keeps the regex-only behaviour of the two-argument call.

    Returns:
        ChunkingStrategy instance.
    """
    if name == "question":
        return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)
    return TokenChunkingStrategy(
        chunk_size=settings.chunk_size,
        overlap=settings.chunk_overlap,
    )
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@ def extract_metadata(
|
|||
page_numbers: List[int | None] | None = None,
|
||||
chunk_file_paths: List[str | None] | None = None,
|
||||
document_id: str | None = None,
|
||||
strategy_type: str = "token",
|
||||
chunk_metadata: List[Dict[str, Any]] | None = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Extract metadata for a list of text chunks.
|
||||
|
||||
|
|
@ -23,6 +25,10 @@ def extract_metadata(
|
|||
- chunk_file_path: path to the per-chunk source file
|
||||
- document_id: unique identifier linking all chunks to the same document
|
||||
|
||||
Package 8 Q&A fields (present when chunk_metadata provided):
|
||||
- strategy_type, section_type, question_index, question_id, question_text,
|
||||
section_heading, answer_contains_table, source_page_range, parent_topic
|
||||
|
||||
Args:
|
||||
file_path: Path to the file associated with the chunks.
|
||||
chunks: List of string chunks to generate metadata for.
|
||||
|
|
@ -31,6 +37,12 @@ def extract_metadata(
|
|||
page_numbers: Optional per-chunk page numbers. Length must match chunks.
|
||||
chunk_file_paths: Optional per-chunk source file paths. Length must match chunks.
|
||||
document_id: Optional unique document identifier applied to all chunks.
|
||||
strategy_type: Chunking strategy used ("token" or "question"). Stored in
|
||||
each chunk's metadata.
|
||||
chunk_metadata: Optional per-chunk metadata dicts from Q&A strategy.
|
||||
Each dict is merged into the corresponding base metadata entry.
|
||||
Length must match chunks. Fields like question_id, question_index,
|
||||
section_type, etc. are forwarded to ChromaDB metadata.
|
||||
|
||||
Returns:
|
||||
A list of metadata dictionaries, one per chunk. Empty list if chunks is empty.
|
||||
|
|
@ -55,6 +67,11 @@ def extract_metadata(
|
|||
f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
|
||||
)
|
||||
|
||||
if chunk_metadata is not None and len(chunk_metadata) > 0 and len(chunk_metadata) != len(chunks):
|
||||
raise ValueError(
|
||||
f"chunk_metadata length ({len(chunk_metadata)}) does not match chunks length ({len(chunks)})"
|
||||
)
|
||||
|
||||
filename = original_filename if original_filename else os.path.basename(file_path)
|
||||
upload_date = datetime.now().isoformat()
|
||||
|
||||
|
|
@ -68,6 +85,7 @@ def extract_metadata(
|
|||
"content_summary": content_summary,
|
||||
"chunk_index": idx,
|
||||
"document_id": document_id,
|
||||
"strategy_type": strategy_type,
|
||||
}
|
||||
page_num = page_numbers[idx] if page_numbers else None
|
||||
if page_num is not None:
|
||||
|
|
@ -75,6 +93,8 @@ def extract_metadata(
|
|||
cfp = chunk_file_paths[idx] if chunk_file_paths else None
|
||||
if cfp is not None:
|
||||
entry["chunk_file_path"] = cfp
|
||||
if chunk_metadata:
|
||||
entry.update(chunk_metadata[idx])
|
||||
metadata.append(entry)
|
||||
|
||||
return metadata
|
||||
|
|
|
|||
|
|
@ -0,0 +1,361 @@
|
|||
"""Q&A-pair chunking utilities for Package 8.
|
||||
|
||||
Provides section detection (LLM + regex), text preprocessing,
|
||||
and chunk building for LegCo documents with Q&A structure.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class Section:
    """A detected section within a LegCo document.

    Q&A sections carry their text in ``question`` / ``answer``; every other
    section type carries it in ``content``.
    """
    type: str  # "qa" | "narrative" | "speaking_notes" | "table" | "toc" | "heading_only"
    # Section heading, e.g. "(A) 排水系統"; empty when there is none.
    heading: str = ""
    # Question identifier such as "A1" or "Q3" (Q&A sections only).
    qa_id: Optional[str] = None
    # Question text (Q&A sections only).
    question: Optional[str] = None
    # Answer text (Q&A sections only).
    answer: Optional[str] = None
    # Body text for non-Q&A sections.
    content: str = ""
    # First page of the section; used as the chunk's reference page.
    start_page: int = 1
    # Last page of the section.
    end_page: int = 1
    # Whether the section contains a table.
    has_table: bool = False
    # Topic the section belongs to, e.g. "排水系統".
    parent_topic: str = ""
|
||||
|
||||
|
||||
# A page footer such as "X-1" on a line of its own.
_FOOTER_RE = re.compile(r"^[A-Z]-\d+\s*$", re.MULTILINE)
# The same footer followed by an ISO date line, e.g. "X-1" then "2024-01-15".
_FOOTER_DATE_RE = re.compile(r"^[A-Z]-\d+\s*\n\d{4}-\d{2}-\d{2}$", re.MULTILINE)
# A line holding only a bracketed section letter, e.g. "(A)".
_HEADER_LETTER_RE = re.compile(r"^(\([A-Z]\))\s*$", re.MULTILINE)
# Colon variants to normalize: U+FE30 (vertical form) and U+FF1A (fullwidth).
# Written as escapes so the characters cannot be flattened to an ASCII colon
# by an editor or re-encoding step, which would turn the pattern into a no-op.
_FULLWIDTH_COLON_RE = re.compile("[\ufe30\uff1a]")


def preprocess_text(pages: List[Tuple[int, str]]) -> str:
    """Concatenate pages, strip footers/headers, normalize colons, insert [PAGE_BREAK: N] markers.

    Args:
        pages: ``(page_number, page_text)`` tuples in document order.

    Returns:
        One string in which each page is preceded by a ``[PAGE_BREAK: N]``
        line and pages are joined with a newline. Empty input yields "".
    """
    parts: List[str] = []
    for idx, (page_num, page_text) in enumerate(pages):
        # Remove the footer+date form first; otherwise the plain footer
        # pattern would strip "X-1" and leave the date line behind.
        text = _FOOTER_DATE_RE.sub("", page_text)
        text = _FOOTER_RE.sub("", text)
        if idx > 0:
            # Bare "(A)"-style lines are only removed on pages after the first.
            text = _HEADER_LETTER_RE.sub("", text)
        text = _FULLWIDTH_COLON_RE.sub(":", text)
        parts.append(f"[PAGE_BREAK: {page_num}]\n{text}")
    return "\n".join(parts)
|
||||
|
||||
|
||||
_STRUCTURE_PROMPT_TEMPLATE = """You are analyzing a Hong Kong Legislative Council document.
|
||||
The text has page markers like [PAGE_BREAK: N] showing where pages begin.
|
||||
|
||||
For each distinct section in this document, identify:
|
||||
1. The section type:
|
||||
- "qa": a question-and-answer pair (問/答 or Q1/Q2 format)
|
||||
- "narrative": policy text, explanatory paragraphs, section content with bullets
|
||||
- "speaking_notes": briefing points (發言要點) with bullet markers
|
||||
- "table": standalone data tables (not embedded in answers)
|
||||
- "toc": table of contents
|
||||
- "heading_only": a section heading with no following content
|
||||
|
||||
2. For "qa" sections:
|
||||
- The question text (exact)
|
||||
- The answer text (exact, including tables, bullet lists, and [內部參考] content)
|
||||
- The question ID if present (e.g. "A1", "Q3")
|
||||
- The start page and end page
|
||||
|
||||
3. For all sections:
|
||||
- The section heading (e.g. "(A) 排水系統", "(1) 住戶的安置補償")
|
||||
- The start page and end page
|
||||
- Whether the section contains tables
|
||||
|
||||
Return JSON:
|
||||
{{
|
||||
"sections": [
|
||||
{{
|
||||
"type": "qa",
|
||||
"heading": "(A) 排水系統",
|
||||
"qa_id": "A1",
|
||||
"question": "...",
|
||||
"answer": "...",
|
||||
"start_page": 2,
|
||||
"end_page": 3,
|
||||
"has_table": true,
|
||||
"parent_topic": "排水系統"
|
||||
}},
|
||||
{{
|
||||
"type": "narrative",
|
||||
"heading": "(1) 住戶的安置補償",
|
||||
"content": "...",
|
||||
"start_page": 2,
|
||||
"end_page": 5,
|
||||
"has_table": false
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
DOCUMENT TEXT:
|
||||
{document_text}"""
|
||||
|
||||
|
||||
def build_structure_detection_prompt(text: str) -> str:
    """Return the section-classification prompt with ``text`` embedded as the document."""
    # The template escapes its literal JSON braces as {{ }}, so str.format
    # only substitutes the document_text placeholder.
    prompt = _STRUCTURE_PROMPT_TEMPLATE.format(document_text=text)
    return prompt
|
||||
|
||||
|
||||
# Captures the body of the first ``` or ```json fenced block.
_MARKDOWN_FENCE_RE = re.compile(r"```(?:json)?\s*\n?(.*?)\n?```", re.DOTALL)


def parse_llm_structure_response(response_text: str) -> List[Section]:
    """Parse the JSON returned by the LLM. Handle markdown code fences.

    The expected payload is a JSON object with a ``sections`` list of
    objects. Missing or ``null`` fields fall back to the ``Section`` defaults
    so that downstream code can always treat heading/content as strings and
    pages as integers. A missing or invalid ``end_page`` falls back to the
    section's ``start_page``. Entries that are not JSON objects are skipped
    with a warning.

    Raises ValueError if response is not valid JSON, or if the top level is
    not an object holding a ``sections`` list.
    """
    cleaned = response_text.strip()
    fence_match = _MARKDOWN_FENCE_RE.search(cleaned)
    if fence_match:
        cleaned = fence_match.group(1).strip()

    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Invalid JSON from LLM structure detection: {exc}") from exc

    # json.loads also accepts arrays and scalars; without this check those
    # would surface as AttributeError on .get() instead of ValueError.
    if not isinstance(data, dict):
        raise ValueError(
            "Invalid JSON from LLM structure detection: expected an object, "
            f"got {type(data).__name__}"
        )

    sections_raw = data.get("sections") or []
    if not isinstance(sections_raw, list):
        raise ValueError(
            "Invalid JSON from LLM structure detection: 'sections' must be a list, "
            f"got {type(sections_raw).__name__}"
        )

    def _as_text(value, default=""):
        """Map null to the default; stringify anything else."""
        return default if value is None else str(value)

    def _as_optional_text(value):
        """Keep null as None; stringify anything else."""
        return None if value is None else str(value)

    def _as_page(value, default):
        """Coerce a page number (possibly a numeric string) to int."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    def _as_bool(value):
        """Interpret JSON booleans, and the strings "true"/"false"."""
        if isinstance(value, str):
            return value.strip().lower() == "true"
        return bool(value)

    sections: List[Section] = []
    for raw in sections_raw:
        if not isinstance(raw, dict):
            logger.warning("Skipping non-object section entry in LLM response: %r", raw)
            continue
        start_page = _as_page(raw.get("start_page"), 1)
        sections.append(Section(
            type=_as_text(raw.get("type"), "narrative") or "narrative",
            heading=_as_text(raw.get("heading")),
            qa_id=raw.get("qa_id"),
            question=_as_optional_text(raw.get("question")),
            answer=_as_optional_text(raw.get("answer")),
            content=_as_text(raw.get("content")),
            start_page=start_page,
            end_page=_as_page(raw.get("end_page"), start_page),
            has_table=_as_bool(raw.get("has_table")),
            parent_topic=_as_text(raw.get("parent_topic")),
        ))
    return sections
|
||||
|
||||
|
||||
# One 問 <id>: question followed by 答 <same id>: answer. The answer runs up to
# the next "問 <id>:" line or the end of the text. Accepted colons are U+FE30,
# U+FF1A (fullwidth) and ASCII, written as escapes so they survive re-encoding.
# The end alternative is \Z so that the final pair is still matched when the
# text has no trailing newline.
_CN_QA_RE = re.compile(
    r"問\s*([A-Z]\d+)\s*[\ufe30\uff1a:]\s*(.*?)\s*"
    r"(?:\n\s*答\s*\1\s*[\ufe30\uff1a:]\s*(.*?)\s*)"
    r"(?=\n\s*問\s*[A-Z]\d+\s*[\ufe30\uff1a:]|\Z)",
    re.DOTALL,
)


def split_chinese_qa(text: str) -> List[Section]:
    """Regex fast-pass for 問/答 format. Returns empty list if no matches found.

    When ``text`` contains ``[PAGE_BREAK: N]`` markers, each section's
    ``start_page`` is the page on which its 問 marker appears and ``end_page``
    is the page on which the matched text ends. Without markers both stay 1.
    """
    from bisect import bisect_right

    # Offsets and page numbers of every page marker, in document order.
    marker_offsets: List[int] = []
    marker_pages: List[int] = []
    for marker in re.finditer(r"\[PAGE_BREAK: (\d+)\]", text):
        marker_offsets.append(marker.start())
        marker_pages.append(int(marker.group(1)))

    def page_at(offset: int) -> int:
        """Return the page of the last marker at or before ``offset`` (1 if none)."""
        count = bisect_right(marker_offsets, offset)
        return marker_pages[count - 1] if count else 1

    sections: List[Section] = []
    for m in _CN_QA_RE.finditer(text):
        qa_id = m.group(1)
        question = m.group(2).strip()
        answer = (m.group(3) or "").strip()
        last_offset = max(m.start(), m.end() - 1)
        sections.append(Section(
            type="qa",
            qa_id=qa_id,
            question=question,
            answer=answer,
            start_page=page_at(m.start()),
            end_page=page_at(last_offset),
        ))
    return sections
|
||||
|
||||
|
||||
# A line starting with "Q<number>" holds the question; the answer is the run
# of following non-empty lines that do not themselves start with "Q<number>".
_EN_QA_RE = re.compile(
    r"^(Q\d+)\s+(.*?)\s*$\n((?:(?!^Q\d+).+(?:\n|$))*)",
    re.MULTILINE,
)


def split_english_qa(text: str) -> List[Section]:
    """Regex fast-pass for Q-number format. Returns empty list if no matches found.

    When ``text`` contains ``[PAGE_BREAK: N]`` markers, each section's
    ``start_page`` is the page on which its Q marker appears and ``end_page``
    is the page on which the matched text ends. Without markers both stay 1.
    """
    from bisect import bisect_right

    # Offsets and page numbers of every page marker, in document order.
    marker_offsets: List[int] = []
    marker_pages: List[int] = []
    for marker in re.finditer(r"\[PAGE_BREAK: (\d+)\]", text):
        marker_offsets.append(marker.start())
        marker_pages.append(int(marker.group(1)))

    def page_at(offset: int) -> int:
        """Return the page of the last marker at or before ``offset`` (1 if none)."""
        count = bisect_right(marker_offsets, offset)
        return marker_pages[count - 1] if count else 1

    sections: List[Section] = []
    for m in _EN_QA_RE.finditer(text):
        qa_id = m.group(1)
        question = m.group(2).strip()
        answer = m.group(3).strip()
        last_offset = max(m.start(), m.end() - 1)
        sections.append(Section(
            type="qa",
            qa_id=qa_id,
            question=question,
            answer=answer,
            start_page=page_at(m.start()),
            end_page=page_at(last_offset),
        ))
    return sections
|
||||
|
||||
|
||||
def _estimate_tokens(text: str) -> int:
|
||||
"""Rough token estimate: ~1.3 tokens per CJK char, ~1 token per 4 chars for Latin."""
|
||||
cjk_count = 0
|
||||
latin_len = 0
|
||||
for ch in text:
|
||||
if "\u4e00" <= ch <= "\u9fff":
|
||||
cjk_count += 1
|
||||
else:
|
||||
latin_len += 1
|
||||
return int(cjk_count * 1.3 + latin_len / 4)
|
||||
|
||||
|
||||
def _split_oversized_qa(
    question: str, answer: str, page: int, heading: str,
    qa_id: Optional[str], question_index: int, has_table: bool,
    parent_topic: str, start_page: int, end_page: int,
    max_tokens: int,
) -> List[Tuple[str, int, dict]]:
    """Split an oversized Q&A answer into parts, each prefixed with the question.

    The answer is cut at paragraph boundaries (blank lines), or at single
    newlines when it has no paragraph breaks. Consecutive pieces are packed
    greedily into parts whose estimated size stays within ``max_tokens``.
    A single piece that is itself larger than the budget is emitted as-is,
    so a part can still exceed ``max_tokens``.

    Args:
        question: Question text, repeated at the top of every part.
        answer: Full answer text to split.
        page: Reference page returned with every part.
        heading: Section heading; prefixed as ``[heading]`` like unsplit
            Q&A chunks, and stored in the metadata.
        qa_id, question_index, has_table, parent_topic, start_page, end_page:
            Copied into every part's metadata.
        max_tokens: Estimated-token budget per part.

    Returns:
        ``(chunk_text, page, metadata)`` tuples, one per part, in order.
        There is always at least one part.
    """
    # Try paragraph boundaries first, then single newlines. The separator is
    # remembered so that re-joined pieces reproduce the original text.
    separator = "\n\n"
    parts = answer.split(separator)
    if len(parts) <= 1:
        separator = "\n"
        parts = answer.split(separator)

    heading_prefix = f"[{heading}]\n" if heading else ""

    # Group parts into sub-chunks that fit within max_tokens
    sub_chunks: List[str] = []
    current = ""
    for part in parts:
        candidate = f"{current}{separator}{part}" if current else part
        # "part 1/N" stands in for the real label, which is not known yet.
        probe = f"{heading_prefix}Question: {question}\n\nAnswer (part 1/N): {candidate}"
        if current and _estimate_tokens(probe) > max_tokens:
            sub_chunks.append(current)
            current = part
        else:
            current = candidate
    if current:
        sub_chunks.append(current)

    # An empty answer would otherwise yield no parts and silently drop the
    # whole Q&A pair; keep the question as a single chunk instead.
    if not sub_chunks:
        sub_chunks = [answer]

    total = len(sub_chunks)
    results: List[Tuple[str, int, dict]] = []
    for i, sub in enumerate(sub_chunks):
        chunk_text = (
            f"{heading_prefix}Question: {question}\n\n"
            f"Answer (part {i + 1}/{total}): {sub}"
        )
        meta = {
            "strategy_type": "question",
            "section_type": "qa",
            "question_index": question_index,
            "question_id": qa_id,
            "question_text": question,
            "section_heading": heading,
            "answer_contains_table": has_table,
            "source_page_range": [start_page, end_page],
            "parent_topic": parent_topic,
        }
        results.append((chunk_text, page, meta))
    return results
|
||||
|
||||
|
||||
def build_chunks_from_sections(
    sections: List[Section], max_tokens: int = 3000,
) -> List[Tuple[str, int, dict]]:
    """Build chunk texts + page refs + metadata from sections.

    Handling per section type:
      - "toc" / "heading_only": skipped.
      - "qa": one chunk ``[heading]`` + ``Question: ...`` + ``Answer: ...``;
        split via ``_split_oversized_qa`` when the estimate exceeds
        ``max_tokens``. ``question_index`` counts Q&A sections in order.
      - "narrative": one chunk, or several split at blank lines when
        oversized; the heading prefix is repeated on every piece.
      - "speaking_notes": one chunk per "⚫" bullet, each with the heading.
      - "table": one chunk, never split.
      - anything else: skipped with a warning.

    Sections with empty or missing content are skipped (Q&A sections are
    always emitted). ``None`` heading/content values are treated as "".

    Args:
        sections: Detected sections in document order.
        max_tokens: Estimated-token budget per chunk.

    Returns List[(chunk_text, page_number, metadata_dict)], where
    page_number is the section's ``start_page``.
    """
    chunks: List[Tuple[str, int, dict]] = []
    qa_index = 0

    for section in sections:
        kind = section.type
        if kind in ("toc", "heading_only"):
            continue

        # Sections built from LLM output may carry None instead of "".
        heading = section.heading or ""
        prefix = f"[{heading}]\n" if heading else ""
        page = section.start_page
        page_range = [section.start_page, section.end_page]

        if kind == "qa":
            question_text = section.question or ""
            answer_text = section.answer or ""
            chunk_text = f"{prefix}Question: {question_text}\n\nAnswer: {answer_text}"

            if _estimate_tokens(chunk_text) > max_tokens:
                chunks.extend(_split_oversized_qa(
                    question=question_text,
                    answer=answer_text,
                    page=page,
                    heading=heading,
                    qa_id=section.qa_id,
                    question_index=qa_index,
                    has_table=section.has_table,
                    parent_topic=section.parent_topic,
                    start_page=section.start_page,
                    end_page=section.end_page,
                    max_tokens=max_tokens,
                ))
            else:
                meta: Dict = {
                    "strategy_type": "question",
                    "section_type": "qa",
                    "question_index": qa_index,
                    "question_id": section.qa_id,
                    "question_text": question_text,
                    "section_heading": heading,
                    "answer_contains_table": section.has_table,
                    "source_page_range": page_range,
                    "parent_topic": section.parent_topic,
                }
                chunks.append((chunk_text, page, meta))

            qa_index += 1
            continue

        if kind not in ("narrative", "speaking_notes", "table"):
            # Make dropped content visible instead of discarding it silently.
            logger.warning("Skipping section with unsupported type %r", kind)
            continue

        content = section.content or ""
        if not content.strip():
            continue

        if kind == "narrative":
            meta = {
                "strategy_type": "question",
                "section_type": "narrative",
                "section_heading": heading,
                "source_page_range": page_range,
            }
            if _estimate_tokens(f"{prefix}{content}") <= max_tokens:
                chunks.append((f"{prefix}{content}", page, meta))
                continue

            # Oversized: pack paragraphs greedily. Each piece gets its own
            # copy of the metadata so later mutation cannot leak across chunks.
            current = ""
            for para in content.split("\n\n"):
                candidate = (current + "\n\n" + para) if current else para
                if current and _estimate_tokens(f"{prefix}{candidate}") > max_tokens:
                    chunks.append((f"{prefix}{current}", page, dict(meta)))
                    current = para
                else:
                    current = candidate
            if current:
                chunks.append((f"{prefix}{current}", page, dict(meta)))

        elif kind == "speaking_notes":
            # Zero-width split: each piece starts at a bullet marker. Text
            # before the first bullet becomes its own piece.
            bullets = [b.strip() for b in re.split(r"(?=⚫)", content) if b.strip()]
            if not bullets:
                bullets = [content]
            for bullet in bullets:
                meta = {
                    "strategy_type": "question",
                    "section_type": "speaking_notes",
                    "section_heading": heading,
                    "source_page_range": page_range,
                }
                chunks.append((f"{prefix}{bullet}", page, meta))

        else:  # kind == "table"
            meta = {
                "strategy_type": "question",
                "section_type": "table",
                "section_heading": heading,
                "answer_contains_table": True,
                "source_page_range": page_range,
            }
            chunks.append((f"{prefix}{content}", page, meta))

    return chunks
|
||||
|
|
@ -0,0 +1,147 @@
|
|||
"""Table extraction utilities for Package 8.
|
||||
|
||||
Provides vision-based and text-based table detection and markdown conversion
|
||||
for LegCo documents. Uses the existing LLM model (vision-capable) for
|
||||
table-to-markdown conversion.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_CACHE_DIR = Path(__file__).resolve().parent.parent.parent / ".cache" / "vision_tables"
|
||||
|
||||
|
||||
async def extract_tables_vision(page_images: List[str], llm_client) -> List[str]:
    """Ask the vision-capable LLM to render each page image as Markdown.

    Args:
        page_images: Base64-encoded PNG strings, one per page.
        llm_client: Client exposing ``model`` and an underlying ``_client``
            with a ``chat.completions.create`` coroutine.

    Returns:
        The non-empty, stripped Markdown responses in page order. Pages that
        fail or produce an empty response are omitted, so the result can be
        shorter than ``page_images``.
    """
    instruction = "\n".join([
        "Convert this page to Markdown. For any tables:",
        "- Use proper markdown table syntax with |---|---| alignment",
        "- Preserve all column headers and row labels",
        "- Do not modify or translate the content",
        "- If a table spans multiple pages, note it",
    ])

    markdown_pages: List[str] = []
    for page_index, encoded_png in enumerate(page_images):
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{encoded_png}"},
        }
        user_message = {
            "role": "user",
            "content": [{"type": "text", "text": instruction}, image_part],
        }
        try:
            completion = await llm_client._client.chat.completions.create(
                model=llm_client.model,
                messages=[user_message],
                temperature=0.1,
            )
            page_markdown = (completion.choices[0].message.content or "").strip()
            if page_markdown:
                markdown_pages.append(page_markdown)
        except Exception:
            # Best effort: one failed page must not abort the remaining pages.
            logger.warning("Vision table extraction failed for page image %d", page_index, exc_info=True)
    return markdown_pages
|
||||
|
||||
|
||||
# Per-line heuristics for "this line looks like part of a table". They are
# applied to single lines, so none of them may depend on a newline.
_TABLE_HEURISTIC_RE = [
    # Markdown separator row, e.g. |---|:--|
    r"(?:\|[\s\-:]+\|)",
    # ASCII-art border, e.g. +-----+
    r"(?:\+[-=]+\+)",
    # Whitespace-aligned columns: at least three cells each followed by a
    # gap of 2+ spaces/tabs, optionally a final cell, up to the line end.
    r"(?:(?:\S+[ \t]{2,}){3,}\S*[ \t]*$)",
]

_TABLE_REGION_PROMPT = (
    "Convert this raw table text extracted from a PDF into a markdown table.\n"
    "Preserve all data exactly. Detect column boundaries and alignment.\n\n"
    "{table_text}"
)


async def extract_tables_text(text: str, llm_client) -> List[str]:
    """Detect table-like text regions, send to LLM for markdown conversion.

    A region starts at the first line matching one of the table heuristics
    and continues over following non-blank lines until a blank line. Regions
    shorter than three lines are discarded.

    Args:
        text: Raw page or document text.
        llm_client: Client exposing an awaitable
            ``complete(prompt, temperature=..., step_name=...)``.

    Returns:
        The non-empty, stripped LLM responses, one per converted region.
        Regions whose conversion fails are logged and skipped. Returns []
        without calling the LLM when no region is found.
    """
    import re

    patterns = [re.compile(pat) for pat in _TABLE_HEURISTIC_RE]

    regions: List[str] = []
    current_region: List[str] = []
    in_table = False

    for line in text.split("\n"):
        is_table_line = any(pattern.search(line) for pattern in patterns)
        if is_table_line:
            in_table = True
            current_region.append(line)
        elif in_table and line.strip():
            # Non-matching but non-blank lines inside a table are kept.
            current_region.append(line)
        else:
            if len(current_region) >= 3:
                regions.append("\n".join(current_region))
            current_region = []
            in_table = False

    # Flush a region that runs to the end of the text.
    if len(current_region) >= 3:
        regions.append("\n".join(current_region))

    if not regions:
        return []

    results: List[str] = []
    for region in regions:
        prompt = _TABLE_REGION_PROMPT.format(table_text=region)
        try:
            response = await llm_client.complete(prompt, temperature=0.1, step_name="TableExtraction")
            if response.strip():
                results.append(response.strip())
        except Exception:
            # Best effort: a failed region must not abort the others.
            logger.warning("Text-based table extraction failed", exc_info=True)
    return results
|
||||
|
||||
|
||||
def inject_tables_into_answer(answer: str, tables_md: List[str]) -> str:
    """Splice Markdown tables into ``answer`` at their header lines.

    For each table, the stripped first line is treated as its header. If that
    header text occurs in the answer, every occurrence is replaced by the full
    Markdown table. Tables whose header is not found are ignored.

    Args:
        answer: Answer text that may contain raw table header lines.
        tables_md: Markdown tables to inject.

    Returns:
        The answer with matching header lines expanded to full tables; the
        original answer if ``tables_md`` is empty or nothing matches.
    """
    if not tables_md:
        return answer

    result = answer
    for table_md in tables_md:
        header = table_md.split("\n", 1)[0].strip()
        # An empty header must be skipped: "" is a substring of every string,
        # and str.replace("", x) would insert the table between every
        # character of the answer.
        if not header:
            continue
        if header in result:
            result = result.replace(header, table_md)
    return result
|
||||
|
||||
|
||||
def cache_vision_result(page_hash: str) -> Optional[str]:
    """Look up a cached vision result on disk.

    Despite its name this function only reads: it returns the contents of
    ``<_CACHE_DIR>/<page_hash>.md`` decoded as UTF-8.

    Args:
        page_hash: Cache key, used verbatim as the file stem.

    Returns:
        The cached Markdown, or ``None`` on a cache miss.
    """
    cache_file = _CACHE_DIR / f"{page_hash}.md"
    # Read directly instead of exists()-then-read: the file can disappear
    # between the two calls, which would turn a cache miss into a crash.
    try:
        return cache_file.read_text(encoding="utf-8")
    except FileNotFoundError:
        return None
|
||||
|
||||
|
||||
def save_vision_result(page_hash: str, markdown: str) -> None:
    """Persist ``markdown`` in the disk cache under the key ``page_hash``.

    Creates the cache directory (and any missing parents) when needed, then
    writes ``<_CACHE_DIR>/<page_hash>.md`` as UTF-8, overwriting any existing
    entry.
    """
    destination = _CACHE_DIR / f"{page_hash}.md"
    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
    destination.write_text(markdown, encoding="utf-8")
|
||||
|
||||
|
||||
def compute_page_hash(page_image_b64: str) -> str:
    """Derive the cache key for a page image.

    Returns the first 16 hex characters of the SHA-256 digest of the
    UTF-8-encoded input string.
    """
    digest = hashlib.sha256()
    digest.update(page_image_b64.encode("utf-8"))
    full_hex = digest.hexdigest()
    return full_hex[:16]
|
||||
|
|
@ -56,9 +56,32 @@ export const ChunkList: React.FC<ChunkListProps> = ({
|
|||
<span className="text-xs font-medium text-gray-500 uppercase">
|
||||
Chunk {chunk.chunk_index}
|
||||
</span>
|
||||
<span className="text-xs text-gray-400">
|
||||
Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
|
||||
</span>
|
||||
{chunk.strategy_type === 'question' && chunk.question_id ? (
|
||||
<>
|
||||
<span className="text-xs text-gray-600">
|
||||
Q: {chunk.question_id}{chunk.question_text ? ` — ${chunk.question_text.length > 80 ? `${chunk.question_text.slice(0, 80)}...` : chunk.question_text}` : ''}
|
||||
</span>
|
||||
{chunk.topic_section && (
|
||||
<span className="text-xs text-gray-500">
|
||||
Topic: {chunk.topic_section}
|
||||
</span>
|
||||
)}
|
||||
{chunk.source_page_range && chunk.source_page_range.length === 2 && (
|
||||
<span className="text-xs text-gray-400">
|
||||
Pages {chunk.source_page_range[0]}-{chunk.source_page_range[1]}
|
||||
</span>
|
||||
)}
|
||||
{chunk.has_table && (
|
||||
<span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-amber-100 text-amber-700">
|
||||
Contains table
|
||||
</span>
|
||||
)}
|
||||
</>
|
||||
) : (
|
||||
<span className="text-xs text-gray-400">
|
||||
Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<div className="text-sm text-gray-700 truncate" title={chunk.content_summary}>
|
||||
{chunk.content_summary.length > 100
|
||||
|
|
@ -67,7 +90,12 @@ export const ChunkList: React.FC<ChunkListProps> = ({
|
|||
</div>
|
||||
{chunk.chunk_file_path && (
|
||||
<a
|
||||
href={getPdfViewerUrl(chunk.chunk_file_path, chunk.page_number ?? undefined)}
|
||||
href={getPdfViewerUrl(
|
||||
chunk.chunk_file_path,
|
||||
chunk.strategy_type === 'question' && chunk.source_page_range && chunk.source_page_range.length > 0
|
||||
? chunk.source_page_range[0]
|
||||
: chunk.page_number ?? undefined
|
||||
)}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline"
|
||||
|
|
|
|||
|
|
@ -29,7 +29,18 @@ export const DocumentList: React.FC<DocumentListProps> = ({
|
|||
<div className="flex items-center space-x-3 flex-1">
|
||||
<FileText className="w-5 h-5 text-gray-500 flex-shrink-0" />
|
||||
<div className="flex-1 min-w-0">
|
||||
<div className="font-medium text-gray-900 truncate">{doc.filename}</div>
|
||||
<div className="flex items-center space-x-2">
|
||||
<span className="font-medium text-gray-900 truncate">{doc.filename}</span>
|
||||
{doc.chunking_strategy === 'question' ? (
|
||||
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-blue-100 text-blue-700">
|
||||
chunked by question
|
||||
</span>
|
||||
) : (
|
||||
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-gray-100 text-gray-700">
|
||||
chunked by token
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<div className="text-sm text-gray-500">
|
||||
{doc.chunk_count} chunks • Uploaded {doc.upload_date}
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import axios from 'axios'
|
||||
import type { QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'
|
||||
import type { ChunkingStrategy, QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'
|
||||
|
||||
const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 'http://localhost:8000/api/v1'
|
||||
|
||||
|
|
@ -48,10 +48,10 @@ export const queryDocumentStream = async (
|
|||
}
|
||||
}
|
||||
|
||||
export const ingestDocument = async (file: File): Promise<IngestResponse> => {
|
||||
export const ingestDocument = async (file: File, strategy: ChunkingStrategy = 'token'): Promise<IngestResponse> => {
|
||||
const form = new FormData()
|
||||
form.append('file', file)
|
||||
const resp = await apiClient.post<IngestResponse>('/ingest', form, {
|
||||
const resp = await apiClient.post<IngestResponse>(`/ingest?strategy=${strategy}`, form, {
|
||||
headers: { 'Content-Type': 'multipart/form-data' },
|
||||
})
|
||||
return resp.data
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import React from 'react'
|
||||
import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
|
||||
import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api'
|
||||
import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
|
||||
import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, ChunkingStrategy, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
|
||||
import { useState, useCallback, useRef } from 'react'
|
||||
|
||||
export const queryClient = new QueryClient()
|
||||
|
|
@ -185,8 +185,8 @@ export const useQueryDocumentStream = () => {
|
|||
}
|
||||
|
||||
export const useIngestDocument = () => {
|
||||
return useMutation<IngestResponse, Error, File>({
|
||||
mutationFn: ingestDocument,
|
||||
return useMutation<IngestResponse, Error, { file: File; strategy: ChunkingStrategy }>({
|
||||
mutationFn: ({ file, strategy }) => ingestDocument(file, strategy),
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,10 +1,11 @@
|
|||
import React, { useState, useCallback, useMemo } from 'react'
|
||||
import { Database, AlertCircle, CheckCircle, XCircle, Loader2 } from 'lucide-react'
|
||||
import { Database, AlertCircle, CheckCircle, XCircle, Loader2, Type, MessageSquare } from 'lucide-react'
|
||||
import { useQueryClient } from '@tanstack/react-query'
|
||||
import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries'
|
||||
import { DocumentList } from '../components/DocumentList'
|
||||
import { ChunkList } from '../components/ChunkList'
|
||||
import { DocumentUpload } from '../components/DocumentUpload'
|
||||
import type { ChunkingStrategy } from '../types'
|
||||
|
||||
interface FileUploadEntry {
|
||||
name: string
|
||||
|
|
@ -22,6 +23,7 @@ export const RAGDatabasePage: React.FC = () => {
|
|||
const initialDocId = useMemo(() => getDocumentIdFromUrl(), [])
|
||||
const [expandedId, setExpandedId] = useState<string | null>(initialDocId)
|
||||
const [uploadEntries, setUploadEntries] = useState<FileUploadEntry[]>([])
|
||||
const [chunkingStrategy, setChunkingStrategy] = useState<ChunkingStrategy>('token')
|
||||
|
||||
const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments()
|
||||
const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId)
|
||||
|
|
@ -60,7 +62,7 @@ export const RAGDatabasePage: React.FC = () => {
|
|||
const results = await Promise.allSettled(
|
||||
files.map(async (file) => {
|
||||
try {
|
||||
await ingestDocumentMutation.mutateAsync(file)
|
||||
await ingestDocumentMutation.mutateAsync({ file, strategy: chunkingStrategy })
|
||||
setUploadEntries((prev) =>
|
||||
prev.map((e) =>
|
||||
e.name === file.name ? { ...e, status: 'success' as const } : e
|
||||
|
|
@ -80,7 +82,7 @@ export const RAGDatabasePage: React.FC = () => {
|
|||
|
||||
queryClient.invalidateQueries({ queryKey: ['documents'] })
|
||||
setTimeout(() => setUploadEntries([]), 5000)
|
||||
}, [ingestDocumentMutation, queryClient])
|
||||
}, [ingestDocumentMutation, queryClient, chunkingStrategy])
|
||||
|
||||
const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length
|
||||
const successCount = uploadEntries.filter((e) => e.status === 'success').length
|
||||
|
|
@ -127,6 +129,42 @@ export const RAGDatabasePage: React.FC = () => {
|
|||
/>
|
||||
</div>
|
||||
|
||||
<div className="mt-3 flex items-center space-x-4">
|
||||
<span className="text-sm font-medium text-gray-700">Chunking strategy:</span>
|
||||
<div className="flex items-center space-x-3">
|
||||
<label className="flex items-center space-x-2 cursor-pointer">
|
||||
<input
|
||||
type="radio"
|
||||
name="chunking-strategy"
|
||||
value="token"
|
||||
checked={chunkingStrategy === 'token'}
|
||||
onChange={() => setChunkingStrategy('token')}
|
||||
className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
|
||||
/>
|
||||
<Type className="w-4 h-4 text-gray-500" />
|
||||
<div>
|
||||
<span className="text-sm font-medium text-gray-900">Chunk by Token</span>
|
||||
<span className="text-xs text-gray-500 ml-1">Split by token windows with overlap</span>
|
||||
</div>
|
||||
</label>
|
||||
<label className="flex items-center space-x-2 cursor-pointer">
|
||||
<input
|
||||
type="radio"
|
||||
name="chunking-strategy"
|
||||
value="question"
|
||||
checked={chunkingStrategy === 'question'}
|
||||
onChange={() => setChunkingStrategy('question')}
|
||||
className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
|
||||
/>
|
||||
<MessageSquare className="w-4 h-4 text-gray-500" />
|
||||
<div>
|
||||
<span className="text-sm font-medium text-gray-900">Chunk by Question</span>
|
||||
<span className="text-xs text-gray-500 ml-1">Detect Q&A pairs, extract tables</span>
|
||||
</div>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{hasEntries && (
|
||||
<div className="mt-4 space-y-2">
|
||||
<div className="text-sm font-medium text-gray-600">
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
export type ChunkingStrategy = 'token' | 'question'
|
||||
|
||||
export interface SourceMetadata {
|
||||
filename: string
|
||||
upload_date: string
|
||||
|
|
@ -40,6 +42,7 @@ export interface IngestResponse {
|
|||
document_id: string
|
||||
chunk_count: number
|
||||
filename: string
|
||||
strategy: ChunkingStrategy
|
||||
}
|
||||
|
||||
export interface DocumentInfo {
|
||||
|
|
@ -47,6 +50,7 @@ export interface DocumentInfo {
|
|||
filename: string
|
||||
chunk_count: number
|
||||
upload_date: string
|
||||
chunking_strategy: ChunkingStrategy
|
||||
}
|
||||
|
||||
export interface ChunkInfo {
|
||||
|
|
@ -55,6 +59,13 @@ export interface ChunkInfo {
|
|||
content_summary: string
|
||||
page_number: number | null
|
||||
chunk_file_path: string | null
|
||||
strategy_type: ChunkingStrategy
|
||||
question_index: number | null
|
||||
question_id: string | null
|
||||
question_text: string | null
|
||||
topic_section: string | null
|
||||
source_page_range: number[] | null
|
||||
has_table: boolean | null
|
||||
}
|
||||
|
||||
export interface DocumentListResponse {
|
||||
|
|
|
|||
Loading…
Reference in New Issue