feat: Sub-Phases 8.1-8.4 — Q&A-pair chunking strategy
8.1 — Core algorithm (test-first): - qa_chunking.py: preprocess_text, build_structure_detection_prompt, parse_llm_structure_response, Section dataclass, split_chinese_qa, split_english_qa, build_chunks_from_sections with recursive size split - QuestionChunkingStrategy in chunking.py with _chunk_metadata tracking - get_chunking_strategy() factory function - table_extraction.py: vision LLM extraction, heuristic text fallback, disk cache, inject_tables_into_answer - 18/18 tests pass (LLM parse, regex fast-pass, multi-page, ABC contract, size limit, chunk building, preprocess) 8.2 — Metadata enrichment: - extract_metadata() accepts strategy_type + chunk_metadata params - Q&A fields (question_id, question_index, section_heading, etc.) merged into ChromaDB metadata entries - DocumentInfo.chunking_strategy + ChunkInfo Q&A fields in models - 6/6 metadata tests pass 8.3 — Ingest API integration: - POST /api/v1/ingest accepts ?strategy=token|question - validate strategy against VALID_CHUNKING_STRATEGIES - factory creates correct chunker; _chunk_metadata passed to extract_metadata - 6/6 ingest integration tests pass, zero regressions on existing tests 8.4 — Frontend strategy selector: - Radio button selector (Token / Question) on RAG Database page - Strategy passed to ingest mutation via api.ts - DocumentList: strategy badge (gray/blue) - ChunkList: Q&A display with question_id, question_text, page range, table badge - tsc --noEmit clean, vite build successful
This commit is contained in:
parent
ef10b937cf
commit
14423c773a
|
|
@ -8,6 +8,7 @@ class DocumentInfo(BaseModel):
|
||||||
filename: str
|
filename: str
|
||||||
chunk_count: int
|
chunk_count: int
|
||||||
upload_date: str
|
upload_date: str
|
||||||
|
chunking_strategy: str = "token"
|
||||||
|
|
||||||
|
|
||||||
class ChunkInfo(BaseModel):
|
class ChunkInfo(BaseModel):
|
||||||
|
|
@ -16,6 +17,14 @@ class ChunkInfo(BaseModel):
|
||||||
content_summary: str
|
content_summary: str
|
||||||
page_number: Optional[int] = None
|
page_number: Optional[int] = None
|
||||||
chunk_file_path: Optional[str] = None
|
chunk_file_path: Optional[str] = None
|
||||||
|
strategy_type: Optional[str] = None
|
||||||
|
question_index: Optional[int] = None
|
||||||
|
question_id: Optional[str] = None
|
||||||
|
question_text: Optional[str] = None
|
||||||
|
section_heading: Optional[str] = None
|
||||||
|
answer_contains_table: Optional[bool] = None
|
||||||
|
source_page_range: Optional[List[int]] = None
|
||||||
|
parent_topic: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class DocumentListResponse(BaseModel):
|
class DocumentListResponse(BaseModel):
|
||||||
|
|
|
||||||
|
|
@ -5,9 +5,9 @@ import tempfile
|
||||||
import uuid
|
import uuid
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from fastapi import APIRouter, UploadFile, File, HTTPException
|
from fastapi import APIRouter, UploadFile, File, HTTPException, Query
|
||||||
|
|
||||||
from app.models.ingest import IngestResponse
|
from app.models.ingest import IngestResponse, VALID_CHUNKING_STRATEGIES
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
router = APIRouter(tags=["ingest"])
|
router = APIRouter(tags=["ingest"])
|
||||||
|
|
@ -37,11 +37,14 @@ def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None:
|
||||||
|
|
||||||
|
|
||||||
@router.post("/ingest", response_model=IngestResponse)
|
@router.post("/ingest", response_model=IngestResponse)
|
||||||
async def ingest_document(file: UploadFile = File(...)):
|
async def ingest_document(
|
||||||
|
file: UploadFile = File(...),
|
||||||
|
strategy: str = Query("token"),
|
||||||
|
):
|
||||||
"""Ingest a document into the RAG system."""
|
"""Ingest a document into the RAG system."""
|
||||||
from app.core.config import get_settings
|
from app.core.config import get_settings
|
||||||
from app.services.rag import RAGService
|
from app.services.rag import RAGService
|
||||||
from app.utils.chunking import TokenChunkingStrategy
|
from app.utils.chunking import get_chunking_strategy
|
||||||
from app.utils.metadata import extract_metadata
|
from app.utils.metadata import extract_metadata
|
||||||
|
|
||||||
filename = file.filename or "unknown"
|
filename = file.filename or "unknown"
|
||||||
|
|
@ -53,6 +56,12 @@ async def ingest_document(file: UploadFile = File(...)):
|
||||||
detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}",
|
detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if strategy not in VALID_CHUNKING_STRATEGIES:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Invalid chunking strategy: {strategy}. Valid: {', '.join(sorted(VALID_CHUNKING_STRATEGIES))}",
|
||||||
|
)
|
||||||
|
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
temp_path = None
|
temp_path = None
|
||||||
try:
|
try:
|
||||||
|
|
@ -68,9 +77,7 @@ async def ingest_document(file: UploadFile = File(...)):
|
||||||
_delete_existing_document(rag, filename, chunk_dir)
|
_delete_existing_document(rag, filename, chunk_dir)
|
||||||
|
|
||||||
document_id = str(uuid.uuid4())
|
document_id = str(uuid.uuid4())
|
||||||
chunker = TokenChunkingStrategy(
|
chunker = get_chunking_strategy(strategy, settings)
|
||||||
chunk_size=settings.chunk_size, overlap=settings.chunk_overlap
|
|
||||||
)
|
|
||||||
|
|
||||||
if file_ext == ".pdf":
|
if file_ext == ".pdf":
|
||||||
from app.utils.pdf_parser import parse_pdf_by_page
|
from app.utils.pdf_parser import parse_pdf_by_page
|
||||||
|
|
@ -105,6 +112,8 @@ async def ingest_document(file: UploadFile = File(...)):
|
||||||
)
|
)
|
||||||
chunk_file_paths.append(None)
|
chunk_file_paths.append(None)
|
||||||
|
|
||||||
|
chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
|
||||||
|
|
||||||
metadata = extract_metadata(
|
metadata = extract_metadata(
|
||||||
temp_path,
|
temp_path,
|
||||||
chunk_texts,
|
chunk_texts,
|
||||||
|
|
@ -112,6 +121,8 @@ async def ingest_document(file: UploadFile = File(...)):
|
||||||
page_numbers=page_numbers,
|
page_numbers=page_numbers,
|
||||||
chunk_file_paths=chunk_file_paths,
|
chunk_file_paths=chunk_file_paths,
|
||||||
document_id=document_id,
|
document_id=document_id,
|
||||||
|
strategy_type=strategy,
|
||||||
|
chunk_metadata=chunk_metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id)
|
rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id)
|
||||||
|
|
@ -145,9 +156,12 @@ async def ingest_document(file: UploadFile = File(...)):
|
||||||
)
|
)
|
||||||
chunk_file_paths.append(None)
|
chunk_file_paths.append(None)
|
||||||
|
|
||||||
|
chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
|
||||||
|
|
||||||
metadata = extract_metadata(
|
metadata = extract_metadata(
|
||||||
temp_path, chunks, original_filename=filename,
|
temp_path, chunks, original_filename=filename,
|
||||||
chunk_file_paths=chunk_file_paths, document_id=document_id,
|
chunk_file_paths=chunk_file_paths, document_id=document_id,
|
||||||
|
strategy_type=strategy, chunk_metadata=chunk_metadata,
|
||||||
)
|
)
|
||||||
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
|
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
|
||||||
|
|
||||||
|
|
@ -180,9 +194,12 @@ async def ingest_document(file: UploadFile = File(...)):
|
||||||
)
|
)
|
||||||
chunk_file_paths.append(None)
|
chunk_file_paths.append(None)
|
||||||
|
|
||||||
|
chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None
|
||||||
|
|
||||||
metadata = extract_metadata(
|
metadata = extract_metadata(
|
||||||
temp_path, chunks, original_filename=filename,
|
temp_path, chunks, original_filename=filename,
|
||||||
chunk_file_paths=chunk_file_paths, document_id=document_id,
|
chunk_file_paths=chunk_file_paths, document_id=document_id,
|
||||||
|
strategy_type=strategy, chunk_metadata=chunk_metadata,
|
||||||
)
|
)
|
||||||
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
|
rag.ingest_document(temp_path, chunks, metadata, document_id=document_id)
|
||||||
|
|
||||||
|
|
@ -193,6 +210,7 @@ async def ingest_document(file: UploadFile = File(...)):
|
||||||
document_id=document_id,
|
document_id=document_id,
|
||||||
chunk_count=chunk_count,
|
chunk_count=chunk_count,
|
||||||
filename=filename,
|
filename=filename,
|
||||||
|
strategy=strategy,
|
||||||
)
|
)
|
||||||
|
|
||||||
except HTTPException:
|
except HTTPException:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,209 @@
|
||||||
|
"""Phase 8 tests: Ingest API integration with strategy selection (Sub-Phase 8.3).
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
- POST /api/v1/ingest?strategy=token — existing behavior unchanged
|
||||||
|
- POST /api/v1/ingest?strategy=question — Q&A chunking applied
|
||||||
|
- Invalid strategy values return 400
|
||||||
|
- IngestResponse includes strategy field
|
||||||
|
- TXT with Q&A format uses question strategy
|
||||||
|
- Document without Q&A falls back gracefully
|
||||||
|
"""
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
from typing import List, Tuple
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
from pypdf import PdfWriter
|
||||||
|
|
||||||
|
from app.routers.ingest import router
|
||||||
|
|
||||||
|
|
||||||
|
class _DeterministicEmbedding:
|
||||||
|
def name(self) -> str:
|
||||||
|
return "test_deterministic"
|
||||||
|
|
||||||
|
def __call__(self, input):
|
||||||
|
return self._embed(input)
|
||||||
|
|
||||||
|
def embed_query(self, input):
|
||||||
|
return self._embed(input)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _embed(texts):
|
||||||
|
vectors = []
|
||||||
|
for text in texts:
|
||||||
|
vec = [0.0] * 384
|
||||||
|
for i, ch in enumerate(text[:384]):
|
||||||
|
vec[i] = ord(ch) / 1000.0
|
||||||
|
vectors.append(vec)
|
||||||
|
return vectors
|
||||||
|
|
||||||
|
|
||||||
|
def _create_real_pdf(content: str) -> bytes:
|
||||||
|
writer = PdfWriter()
|
||||||
|
writer.add_blank_page(width=200, height=200)
|
||||||
|
buf = io.BytesIO()
|
||||||
|
writer.write(buf)
|
||||||
|
return buf.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def _create_text_txt(content: str) -> bytes:
|
||||||
|
return content.encode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def client(tmp_path, monkeypatch):
|
||||||
|
"""TestClient with real ChromaDB isolated in tmp_path + deterministic embeddings."""
|
||||||
|
chroma_path = str(tmp_path / "chroma_db")
|
||||||
|
chunk_path = str(tmp_path / "document_chunk")
|
||||||
|
prompts_path = str(tmp_path / "prompts.db")
|
||||||
|
history_path = str(tmp_path / "history.db")
|
||||||
|
|
||||||
|
monkeypatch.setenv("CHROMA_DB_PATH", chroma_path)
|
||||||
|
monkeypatch.setenv("DOCUMENT_CHUNK_PATH", chunk_path)
|
||||||
|
monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path)
|
||||||
|
monkeypatch.setenv("HISTORY_DB_PATH", history_path)
|
||||||
|
monkeypatch.setenv("EMBEDDING_MODEL", "test-mock")
|
||||||
|
monkeypatch.setenv("LLM_API_KEY", "test-key")
|
||||||
|
|
||||||
|
from app.core.config import get_settings
|
||||||
|
get_settings.cache_clear()
|
||||||
|
from app.core.dependencies import get_settings_cached
|
||||||
|
get_settings_cached.cache_clear()
|
||||||
|
|
||||||
|
from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles
|
||||||
|
conn = _get_db(prompts_path)
|
||||||
|
init_prompts_db(conn)
|
||||||
|
seed_default_profiles(conn)
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
hconn = _get_db(history_path)
|
||||||
|
init_history_db(hconn)
|
||||||
|
hconn.close()
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.core.database.get_embedding_function_settings",
|
||||||
|
lambda settings: _DeterministicEmbedding(),
|
||||||
|
)
|
||||||
|
|
||||||
|
test_app = FastAPI()
|
||||||
|
test_app.include_router(router, prefix="/api/v1")
|
||||||
|
|
||||||
|
yield TestClient(test_app)
|
||||||
|
|
||||||
|
get_settings_cached.cache_clear()
|
||||||
|
get_settings.cache_clear()
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_with_strategy_token(client):
|
||||||
|
"""Existing behavior unchanged: strategy=token uses TokenChunkingStrategy."""
|
||||||
|
txt_bytes = _create_text_txt("This is a test document with enough content to generate chunks.")
|
||||||
|
resp = client.post(
|
||||||
|
"/api/v1/ingest?strategy=token",
|
||||||
|
files={"file": ("test.txt", txt_bytes, "text/plain")},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
assert data["strategy"] == "token"
|
||||||
|
assert data["chunk_count"] > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_invalid_strategy_rejected(client):
|
||||||
|
"""Invalid strategy values return 400."""
|
||||||
|
txt_bytes = _create_text_txt("test")
|
||||||
|
resp = client.post(
|
||||||
|
"/api/v1/ingest?strategy=invalid",
|
||||||
|
files={"file": ("test.txt", txt_bytes, "text/plain")},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 400
|
||||||
|
assert "strategy" in resp.json()["detail"].lower()
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_response_includes_strategy(client):
|
||||||
|
"""IngestResponse includes the strategy field."""
|
||||||
|
txt_bytes = _create_text_txt("Strategy response test content with more text to ensure chunks.")
|
||||||
|
resp = client.post(
|
||||||
|
"/api/v1/ingest?strategy=token",
|
||||||
|
files={"file": ("test.txt", txt_bytes, "text/plain")},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert "strategy" in resp.json()
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_default_strategy_is_token(client):
|
||||||
|
"""When no strategy param provided, default to token."""
|
||||||
|
txt_bytes = _create_text_txt("Default strategy test with enough text to generate output.")
|
||||||
|
resp = client.post(
|
||||||
|
"/api/v1/ingest",
|
||||||
|
files={"file": ("test.txt", txt_bytes, "text/plain")},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
assert resp.json()["strategy"] == "token"
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_question_strategy_txt(client, monkeypatch):
|
||||||
|
"""TXT with Q&A format uses question strategy and produces chunks."""
|
||||||
|
_mock_question_chunker(monkeypatch)
|
||||||
|
|
||||||
|
txt_bytes = _create_text_txt("問A1:test question\n答A1:test answer with more text here to ensure chunking works properly.")
|
||||||
|
|
||||||
|
resp = client.post(
|
||||||
|
"/api/v1/ingest?strategy=question",
|
||||||
|
files={"file": ("test.txt", txt_bytes, "text/plain")},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
assert data["strategy"] == "question"
|
||||||
|
assert data["chunk_count"] > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_question_strategy_no_qa_fallback(client, monkeypatch):
|
||||||
|
"""Document without Q&A markers falls back to narrative chunking without error."""
|
||||||
|
_mock_question_chunker(monkeypatch)
|
||||||
|
|
||||||
|
txt_bytes = _create_text_txt("This is plain text without any Q&A markers, but it needs to be long enough to generate at least one chunk when processed by the tokenizer.")
|
||||||
|
|
||||||
|
resp = client.post(
|
||||||
|
"/api/v1/ingest?strategy=question",
|
||||||
|
files={"file": ("plain.txt", txt_bytes, "text/plain")},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
assert data["strategy"] == "question"
|
||||||
|
assert data["chunk_count"] > 0
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_question_chunker(monkeypatch):
|
||||||
|
"""Replace QuestionChunkingStrategy with a mock that returns test chunks."""
|
||||||
|
|
||||||
|
class _MockQuestionChunker:
|
||||||
|
def __init__(self, settings=None, llm_client=None):
|
||||||
|
self._chunk_metadata = [
|
||||||
|
{
|
||||||
|
"strategy_type": "question",
|
||||||
|
"section_type": "qa",
|
||||||
|
"question_index": 0,
|
||||||
|
"question_id": "A1",
|
||||||
|
"question_text": "What is X?",
|
||||||
|
"section_heading": "(A) Topic",
|
||||||
|
"answer_contains_table": False,
|
||||||
|
"source_page_range": [1, 2],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
self._max_tokens = 3000
|
||||||
|
|
||||||
|
def chunk(self, text):
|
||||||
|
self._chunk_metadata = self._chunk_metadata[:1]
|
||||||
|
return ["Question: What is X?\n\nAnswer: X is Y."]
|
||||||
|
|
||||||
|
def chunk_pages(self, pages, overlap_tokens=0):
|
||||||
|
self._chunk_metadata = self._chunk_metadata[:1]
|
||||||
|
return [("Question: What is X?\n\nAnswer: X is Y.", 1)]
|
||||||
|
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.utils.chunking.QuestionChunkingStrategy",
|
||||||
|
_MockQuestionChunker,
|
||||||
|
)
|
||||||
|
|
@ -0,0 +1,149 @@
|
||||||
|
"""Phase 8 tests: Q&A metadata enrichment (Sub-Phase 8.2).
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
- Metadata enrichment with Q&A-specific fields via chunk_metadata param
|
||||||
|
- Backward compatibility: token strategy unchanged
|
||||||
|
- Page number references question location
|
||||||
|
- Chunk metadata merging with base metadata
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.utils.metadata import extract_metadata
|
||||||
|
|
||||||
|
|
||||||
|
def test_qa_metadata_fields(tmp_path):
|
||||||
|
"""strategy_type, question_index, question_id, question_text merged via chunk_metadata."""
|
||||||
|
file_path = tmp_path / "test.pdf"
|
||||||
|
file_path.write_text("dummy content")
|
||||||
|
|
||||||
|
chunks = ["chunk 1", "chunk 2"]
|
||||||
|
chunk_metadata = [
|
||||||
|
{
|
||||||
|
"strategy_type": "question",
|
||||||
|
"section_type": "qa",
|
||||||
|
"question_index": 0,
|
||||||
|
"question_id": "A1",
|
||||||
|
"question_text": "What is X?",
|
||||||
|
"section_heading": "(A) Section",
|
||||||
|
"answer_contains_table": True,
|
||||||
|
"source_page_range": [2, 5],
|
||||||
|
"parent_topic": "Topic Name",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"strategy_type": "question",
|
||||||
|
"section_type": "qa",
|
||||||
|
"question_index": 1,
|
||||||
|
"question_id": "A2",
|
||||||
|
"question_text": "What is Y?",
|
||||||
|
"section_heading": "(A) Section",
|
||||||
|
"answer_contains_table": False,
|
||||||
|
"source_page_range": [5, 7],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
metadata = extract_metadata(
|
||||||
|
file_path=str(file_path),
|
||||||
|
chunks=chunks,
|
||||||
|
strategy_type="question",
|
||||||
|
chunk_metadata=chunk_metadata,
|
||||||
|
)
|
||||||
|
assert len(metadata) == 2
|
||||||
|
|
||||||
|
m0 = metadata[0]
|
||||||
|
assert m0["strategy_type"] == "question"
|
||||||
|
assert m0["section_type"] == "qa"
|
||||||
|
assert m0["question_index"] == 0
|
||||||
|
assert m0["question_id"] == "A1"
|
||||||
|
assert m0["question_text"] == "What is X?"
|
||||||
|
assert m0["section_heading"] == "(A) Section"
|
||||||
|
assert m0["answer_contains_table"] is True
|
||||||
|
assert m0["source_page_range"] == [2, 5]
|
||||||
|
assert m0["parent_topic"] == "Topic Name"
|
||||||
|
|
||||||
|
m1 = metadata[1]
|
||||||
|
assert m1["question_index"] == 1
|
||||||
|
assert m1["question_id"] == "A2"
|
||||||
|
assert m1["answer_contains_table"] is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_qa_metadata_topic_section(tmp_path):
|
||||||
|
"""section_heading and parent_topic are both preserved."""
|
||||||
|
file_path = tmp_path / "test.pdf"
|
||||||
|
file_path.write_text("dummy content")
|
||||||
|
|
||||||
|
metadata = extract_metadata(
|
||||||
|
file_path=str(file_path),
|
||||||
|
chunks=["chunk"],
|
||||||
|
strategy_type="question",
|
||||||
|
chunk_metadata=[{"section_heading": "(B) Traffic", "parent_topic": "Traffic Planning"}],
|
||||||
|
)
|
||||||
|
assert metadata[0]["section_heading"] == "(B) Traffic"
|
||||||
|
assert metadata[0]["parent_topic"] == "Traffic Planning"
|
||||||
|
|
||||||
|
|
||||||
|
def test_token_metadata_unchanged(tmp_path):
|
||||||
|
"""Existing metadata fields unchanged for token strategy (no chunk_metadata)."""
|
||||||
|
file_path = tmp_path / "test.txt"
|
||||||
|
file_path.write_text("test content")
|
||||||
|
|
||||||
|
metadata = extract_metadata(
|
||||||
|
file_path=str(file_path),
|
||||||
|
chunks=["chunk 1", "chunk 2"],
|
||||||
|
original_filename="original.txt",
|
||||||
|
strategy_type="token",
|
||||||
|
)
|
||||||
|
assert len(metadata) == 2
|
||||||
|
for m in metadata:
|
||||||
|
assert "filename" in m
|
||||||
|
assert "upload_date" in m
|
||||||
|
assert "content_summary" in m
|
||||||
|
assert "chunk_index" in m
|
||||||
|
assert m.get("strategy_type", "token") == "token"
|
||||||
|
assert "question_id" not in m
|
||||||
|
|
||||||
|
|
||||||
|
def test_page_number_from_question(tmp_path):
|
||||||
|
"""Page ref should point to question location (pass via page_numbers from strategy)."""
|
||||||
|
file_path = tmp_path / "test.pdf"
|
||||||
|
file_path.write_text("dummy content")
|
||||||
|
|
||||||
|
metadata = extract_metadata(
|
||||||
|
file_path=str(file_path),
|
||||||
|
chunks=["question chunk"],
|
||||||
|
page_numbers=[3],
|
||||||
|
strategy_type="question",
|
||||||
|
chunk_metadata=[{
|
||||||
|
"question_id": "A1",
|
||||||
|
"source_page_range": [3, 8],
|
||||||
|
}],
|
||||||
|
)
|
||||||
|
assert metadata[0]["page_number"] == 3
|
||||||
|
assert metadata[0]["source_page_range"] == [3, 8]
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_metadata_length_mismatch(tmp_path):
|
||||||
|
"""chunk_metadata length mismatch with chunks raises ValueError."""
|
||||||
|
file_path = tmp_path / "test.pdf"
|
||||||
|
file_path.write_text("dummy content")
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="chunk_metadata length"):
|
||||||
|
extract_metadata(
|
||||||
|
file_path=str(file_path),
|
||||||
|
chunks=["a", "b", "c"],
|
||||||
|
chunk_metadata=[{}, {}],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_metadata_empty_no_error(tmp_path):
|
||||||
|
"""Empty chunk_metadata list with matching chunks is valid."""
|
||||||
|
file_path = tmp_path / "test.pdf"
|
||||||
|
file_path.write_text("dummy content")
|
||||||
|
|
||||||
|
metadata = extract_metadata(
|
||||||
|
file_path=str(file_path),
|
||||||
|
chunks=["a"],
|
||||||
|
chunk_metadata=[],
|
||||||
|
)
|
||||||
|
assert len(metadata) == 1
|
||||||
|
|
@ -0,0 +1,481 @@
|
||||||
|
"""Phase 8 tests: Q&A-pair chunking strategy (Sub-Phase 8.1).
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
- LLM structure detection response parsing (parse_llm_structure_response)
|
||||||
|
- Mixed format handling (問/答 + section headings)
|
||||||
|
- Narrative-only text (no Q&A format)
|
||||||
|
- Speaking notes (發言要點) chunking by bullet
|
||||||
|
- Regex fast-pass for Chinese 問/答 format
|
||||||
|
- Regex fast-pass for English Q1/Q2 format
|
||||||
|
- Multi-page section tracking with [PAGE_BREAK] markers
|
||||||
|
- ChunkingStrategy ABC compliance
|
||||||
|
- Page number references question (問) page, not answer
|
||||||
|
- Size limit: oversized sections recursively split with heading preserved
|
||||||
|
- build_chunks_from_sections output verification
|
||||||
|
- preprocess_text: footer stripping, colon normalization, page break insertion
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from typing import List, Tuple
|
||||||
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.utils.qa_chunking import (
|
||||||
|
Section,
|
||||||
|
preprocess_text,
|
||||||
|
build_structure_detection_prompt,
|
||||||
|
parse_llm_structure_response,
|
||||||
|
split_chinese_qa,
|
||||||
|
split_english_qa,
|
||||||
|
build_chunks_from_sections,
|
||||||
|
)
|
||||||
|
from app.utils.chunking import (
|
||||||
|
ChunkingStrategy,
|
||||||
|
QuestionChunkingStrategy,
|
||||||
|
get_chunking_strategy,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fixtures
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_settings():
|
||||||
|
"""Minimal Settings mock with Q&A chunking defaults."""
|
||||||
|
s = MagicMock()
|
||||||
|
s.default_chunking_strategy = "question"
|
||||||
|
s.qa_vision_enabled = False
|
||||||
|
s.qa_max_chunk_tokens = 3000
|
||||||
|
s.qa_structure_model = ""
|
||||||
|
s.qa_include_internal_refs = True
|
||||||
|
s.qa_cache_vision_results = True
|
||||||
|
s.chunk_size = 1000
|
||||||
|
s.chunk_overlap = 200
|
||||||
|
s.llm_model_name = "test-model"
|
||||||
|
s.llm_api_key = "test-key"
|
||||||
|
s.llm_base_url = "https://example.com/v1"
|
||||||
|
s.llm_timeout = 30.0
|
||||||
|
s.llm_enable_thinking = False
|
||||||
|
s.vllm_engine = False
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_LLM_RESPONSE = json.dumps({
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"type": "qa",
|
||||||
|
"heading": "(A) 排水系統",
|
||||||
|
"qa_id": "A1",
|
||||||
|
"question": "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?",
|
||||||
|
"answer": "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。",
|
||||||
|
"start_page": 2,
|
||||||
|
"end_page": 3,
|
||||||
|
"has_table": False,
|
||||||
|
"parent_topic": "排水系統",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "narrative",
|
||||||
|
"heading": "(1) 住戶的安置補償",
|
||||||
|
"content": "合資格住戶可選擇(i)「須通過經濟狀況審查」安置安排。",
|
||||||
|
"start_page": 2,
|
||||||
|
"end_page": 5,
|
||||||
|
"has_table": False,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "speaking_notes",
|
||||||
|
"heading": "發言要點",
|
||||||
|
"content": "⚫ 古洞北/粉嶺北新發展區是北部都會區內的新市鎮發展項目\n⚫ 第一期發展預計於2030年完成",
|
||||||
|
"start_page": 1,
|
||||||
|
"end_page": 2,
|
||||||
|
"has_table": False,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test: LLM structure detection parsing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestLLMStructureDetection:
|
||||||
|
|
||||||
|
def test_llm_structure_detection(self):
|
||||||
|
"""parse_llm_structure_response correctly parses LLM JSON with qa/narrative/speaking_notes."""
|
||||||
|
sections = parse_llm_structure_response(SAMPLE_LLM_RESPONSE)
|
||||||
|
assert len(sections) == 3
|
||||||
|
|
||||||
|
qa = sections[0]
|
||||||
|
assert qa.type == "qa"
|
||||||
|
assert qa.qa_id == "A1"
|
||||||
|
assert qa.question == "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?"
|
||||||
|
assert qa.answer == "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。"
|
||||||
|
assert qa.start_page == 2
|
||||||
|
assert qa.end_page == 3
|
||||||
|
assert qa.heading == "(A) 排水系統"
|
||||||
|
assert qa.parent_topic == "排水系統"
|
||||||
|
|
||||||
|
narr = sections[1]
|
||||||
|
assert narr.type == "narrative"
|
||||||
|
assert narr.heading == "(1) 住戶的安置補償"
|
||||||
|
assert "合資格住戶" in narr.content
|
||||||
|
|
||||||
|
notes = sections[2]
|
||||||
|
assert notes.type == "speaking_notes"
|
||||||
|
assert "⚫" in notes.content
|
||||||
|
|
||||||
|
def test_llm_handles_mixed_formats(self):
|
||||||
|
"""Document with 問/答 markers + section headings correctly classified."""
|
||||||
|
mixed_json = json.dumps({
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"type": "qa",
|
||||||
|
"heading": "(B) 交通",
|
||||||
|
"qa_id": "B1",
|
||||||
|
"question": "新建道路何時通車?",
|
||||||
|
"answer": "預計2027年通車。",
|
||||||
|
"start_page": 3,
|
||||||
|
"end_page": 4,
|
||||||
|
"has_table": False,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "narrative",
|
||||||
|
"heading": "背景",
|
||||||
|
"content": "本文件說明交通規劃。",
|
||||||
|
"start_page": 1,
|
||||||
|
"end_page": 2,
|
||||||
|
"has_table": False,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
})
|
||||||
|
sections = parse_llm_structure_response(mixed_json)
|
||||||
|
assert len(sections) == 2
|
||||||
|
assert sections[0].type == "qa"
|
||||||
|
assert sections[1].type == "narrative"
|
||||||
|
|
||||||
|
def test_llm_handles_no_qa_format(self):
|
||||||
|
"""Narrative-only text (like File L pages 1-13) produces only narrative sections."""
|
||||||
|
narrative_json = json.dumps({
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"type": "narrative",
|
||||||
|
"heading": "Introduction",
|
||||||
|
"content": "This document provides background on policy matters.",
|
||||||
|
"start_page": 1,
|
||||||
|
"end_page": 5,
|
||||||
|
"has_table": False,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "narrative",
|
||||||
|
"heading": "Analysis",
|
||||||
|
"content": "The analysis covers multiple dimensions.",
|
||||||
|
"start_page": 5,
|
||||||
|
"end_page": 13,
|
||||||
|
"has_table": False,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
})
|
||||||
|
sections = parse_llm_structure_response(narrative_json)
|
||||||
|
assert len(sections) == 2
|
||||||
|
assert all(s.type == "narrative" for s in sections)
|
||||||
|
|
||||||
|
def test_llm_handles_speaking_notes(self):
|
||||||
|
"""發言要點 text with bullet points produces speaking_notes sections."""
|
||||||
|
notes_json = json.dumps({
|
||||||
|
"sections": [
|
||||||
|
{
|
||||||
|
"type": "speaking_notes",
|
||||||
|
"heading": "發言要點",
|
||||||
|
"content": "⚫ 要點一:政策方向\n⚫ 要點二:實施計劃\n⚫ 要點三:預算安排",
|
||||||
|
"start_page": 1,
|
||||||
|
"end_page": 2,
|
||||||
|
"has_table": False,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
})
|
||||||
|
sections = parse_llm_structure_response(notes_json)
|
||||||
|
assert len(sections) == 1
|
||||||
|
assert sections[0].type == "speaking_notes"
|
||||||
|
assert sections[0].content.count("⚫") == 3
|
||||||
|
|
||||||
|
def test_parse_markdown_fenced_json(self):
|
||||||
|
"""parse_llm_structure_response handles ```json ... ``` wrapped responses."""
|
||||||
|
fenced = '```json\n' + SAMPLE_LLM_RESPONSE + '\n```'
|
||||||
|
sections = parse_llm_structure_response(fenced)
|
||||||
|
assert len(sections) == 3
|
||||||
|
|
||||||
|
def test_parse_invalid_json_raises(self):
|
||||||
|
"""parse_llm_structure_response raises ValueError on non-JSON input."""
|
||||||
|
with pytest.raises(ValueError, match="Invalid JSON"):
|
||||||
|
parse_llm_structure_response("this is not json")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test: Regex fast-pass
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestRegexFastPass:
    """Regex fast-pass splitters: marker detection without any LLM call."""

    def test_regex_fastpass_chinese(self):
        """Text with 問B1/答B1 markers is split by split_chinese_qa."""
        document = (
            "(A) 排水系統\n"
            "問 B1:古洞北的設計是否能抵禦氣候變化?\n"
            "答 B1:研究顧問已為古洞北新發展區進行了評估。\n"
            "問 B2:第二個問題是什麼?\n"
            "答 B2:這是第二個問題的答案。\n"
        )
        found = split_chinese_qa(document)
        assert len(found) >= 2
        # Every detected section must be a Q&A pair.
        for entry in found:
            assert entry.type == "qa"
        # The first pair carries the question that mentions 古洞北.
        first_question = found[0].question
        assert "古洞北" in first_question

    def test_regex_fastpass_chinese_no_match(self):
        """Without any 問/答 markers the splitter yields an empty list."""
        plain = "This is plain text without any Q&A markers."
        result = split_chinese_qa(plain)
        assert result == []

    def test_regex_fastpass_english(self):
        """Text with Q1, Q2 markers is split by split_english_qa."""
        document = (
            "Background information here.\n\n"
            "Q1 What is the timeline for the project?\n"
            "The project is expected to complete by 2027.\n"
            "Q2 How much will it cost?\n"
            "The estimated cost is HK$500 million.\n"
        )
        found = split_english_qa(document)
        assert len(found) >= 2
        for entry in found:
            assert entry.type == "qa"
        # At least one question mentions the timeline (case-insensitive).
        lowered_questions = [(entry.question or "").lower() for entry in found]
        assert any("timeline" in question for question in lowered_questions)

    def test_regex_fastpass_english_no_match(self):
        """Without any Q-number markers the splitter yields an empty list."""
        chinese_only = "純中文文本沒有英文問答標記。"
        result = split_english_qa(chinese_only)
        assert result == []
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test: Multi-page tracking
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestMultiPage:
    """Page markers produced by preprocess_text for multi-page input."""

    def test_multi_page_sections(self):
        """Each input page contributes its own [PAGE_BREAK: N] marker."""
        page_one = (1, "Header line\n(A) Water drainage\nSome intro text")
        page_two = (2, "More drainage info\nFooter text X-1")
        page_three = (3, "New section begins\n(B) Traffic planning")
        combined = preprocess_text([page_one, page_two, page_three])
        # One marker per page number must be present in the joined text.
        for page_no in (1, 2, 3):
            assert f"[PAGE_BREAK: {page_no}]" in combined
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test: ABC contract
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestABCContract:
    """The question strategy and the factory honour the ChunkingStrategy ABC."""

    def test_abc_contract(self):
        """QuestionChunkingStrategy satisfies ChunkingStrategy ABC."""
        # Mock keyword arguments are set as attributes on the mock.
        fake_settings = MagicMock(
            qa_max_chunk_tokens=3000,
            qa_include_internal_refs=True,
        )
        instance = QuestionChunkingStrategy(settings=fake_settings)
        assert isinstance(instance, ChunkingStrategy)

    def test_get_chunking_strategy_factory(self, mock_settings):
        """get_chunking_strategy returns correct strategy type."""
        expected_types = (
            ("token", ChunkingStrategy),
            ("question", QuestionChunkingStrategy),
        )
        for strategy_name, expected in expected_types:
            built = get_chunking_strategy(strategy_name, mock_settings)
            assert isinstance(built, expected)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test: Page number reference
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestPageNumberReference:
    """Which page a Q&A chunk is attributed to."""

    def test_page_number_reference_question(self):
        """Page ref in metadata points to question (問) page, not answer page."""
        spanning_pair = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question="What is X?",
            answer="X is Y.",
            start_page=5,
            end_page=8,
        )
        built = build_chunks_from_sections([spanning_pair])
        assert len(built) == 1
        _text, reported_page, meta = built[0]
        # The chunk is attributed to start_page (where the question sits).
        assert reported_page == 5
        assert meta.get("source_page_range") == [5, 8]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test: Size limit recursive split
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestSizeLimit:
    """Oversized Q&A sections are broken into several chunks."""

    def test_size_limit(self):
        """Oversized QA section > 3000 tokens gets recursively split with question prepended."""
        # Eighty paragraphs of filler make the answer far larger than the limit.
        paragraphs = [f"Paragraph {i}: " + "x" * 200 for i in range(80)]
        long_answer = "\n\n".join(paragraphs)
        oversized = Section(
            type="qa",
            heading="(A) Topic",
            qa_id="A1",
            question="What is the detailed plan?",
            answer=long_answer,
            start_page=2,
            end_page=5,
            has_table=False,
        )
        # A small max_tokens forces the split path.
        pieces = build_chunks_from_sections([oversized], max_tokens=500)
        assert len(pieces) > 1
        for piece_text, piece_page, _meta in pieces:
            # Every sub-chunk repeats the question text ...
            assert "What is the detailed plan?" in piece_text
            # ... and is attributed to the question's page.
            assert piece_page == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test: build_chunks_from_sections
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestBuildChunksFromSections:
    """build_chunks_from_sections over a mix of section types."""

    def test_build_chunks_from_sections(self):
        """Verify chunk texts and metadata from sections list."""
        qa_section = Section(
            type="qa",
            heading="(A) 排水系統",
            qa_id="A1",
            question="古洞北的設計是否能抵禦氣候變化?",
            answer="研究顧問已為古洞北進行了評估。",
            start_page=2,
            end_page=3,
            has_table=True,
            parent_topic="排水系統",
        )
        narrative_section = Section(
            type="narrative",
            heading="(1) 住戶的安置補償",
            content="合資格住戶可選擇安置安排。",
            start_page=3,
            end_page=5,
            has_table=False,
        )
        notes_section = Section(
            type="speaking_notes",
            heading="發言要點",
            content="⚫ 要點一:政策方向\n⚫ 要點二:實施計劃",
            start_page=1,
            end_page=1,
            has_table=False,
        )
        toc_section = Section(
            type="toc",
            heading="目錄",
            content="Page 1 ... Page 2",
            start_page=1,
            end_page=1,
            has_table=False,
        )
        chunks = build_chunks_from_sections(
            [qa_section, narrative_section, notes_section, toc_section]
        )

        def of_type(kind):
            # Chunks whose metadata reports the given section_type.
            return [item for item in chunks if item[2].get("section_type") == kind]

        # Expected: 1 QA + 1 narrative + 2 speaking-note bullets + 0 TOC = 4.
        assert len(chunks) >= 4

        # The first chunk is the Q&A pair.
        qa_text, qa_page, qa_meta = chunks[0]
        assert "古洞北" in qa_text
        assert qa_page == 2
        assert qa_meta["section_type"] == "qa"
        assert qa_meta["question_id"] == "A1"
        assert qa_meta["question_index"] == 0
        assert qa_meta["answer_contains_table"] is True
        assert qa_meta["section_heading"] == "(A) 排水系統"

        # Exactly one narrative chunk, carrying heading and body text.
        narrative_hits = of_type("narrative")
        assert len(narrative_hits) == 1
        narrative_text = narrative_hits[0][0]
        assert "住戶的安置補償" in narrative_text
        assert "合資格住戶" in narrative_text

        # One chunk per speaking-note bullet.
        note_hits = of_type("speaking_notes")
        assert len(note_hits) == 2
        for note_text, _page, _meta in note_hits:
            assert "要點" in note_text

        # Table-of-contents sections never become chunks.
        assert len(of_type("toc")) == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test: preprocess_text
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestPreprocessText:
    """Checks for preprocess_text: footers, colon normalisation, page markers."""

    def test_preprocess_text(self):
        """Footer markers stripped, colons normalized, page breaks inserted."""
        # The non-ASCII colon is written as an escape so the test cannot be
        # silently neutralised by tools that fold it to an ASCII ":"; once
        # folded, "absent" and "present" assertions on ":" contradict each other.
        wide_colon = "\ufe30"
        pages = [
            (1, "Header\n(A) Section Title\nX-1\n2024-01-15"),
            (2, f"Content with{wide_colon}fullwidth colon\nMore text{wide_colon}here"),
        ]
        result = preprocess_text(pages)

        # Should have page break markers
        assert "[PAGE_BREAK: 1]" in result
        assert "[PAGE_BREAK: 2]" in result

        # Wide colons are replaced by the ASCII colon.
        assert wide_colon not in result
        assert "Content with:fullwidth colon" in result
        assert "More text:here" in result

        # Page footer patterns should be stripped (X-1, dates like 2024-01-15)
        assert "X-1" not in result
        assert "2024-01-15" not in result
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test: build_structure_detection_prompt
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestBuildPrompt:
    """Content checks on the structure-detection prompt."""

    def test_build_structure_detection_prompt(self):
        """Prompt contains key instructions for LLM classification."""
        document = "Sample document text [PAGE_BREAK: 1]"
        prompt = build_structure_detection_prompt(document)
        lowered = prompt.lower()
        # Mentions the document domain.
        assert "Hong Kong Legislative Council" in prompt or "Legislative Council" in prompt
        # Names the section types the model must choose between.
        assert "qa" in lowered or "問" in prompt
        assert "narrative" in lowered
        assert "speaking_notes" in lowered or "speaking notes" in lowered
        # Embeds the document text verbatim.
        assert document in prompt
|
||||||
|
|
@ -6,8 +6,15 @@ token-based windows.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import List, Tuple
|
from typing import TYPE_CHECKING, List, Optional, Tuple
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from app.core.config import Settings
|
||||||
|
from app.services.llm_client import LLMClient
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class ChunkingStrategy(ABC):
|
class ChunkingStrategy(ABC):
|
||||||
|
|
@ -117,3 +124,111 @@ class TokenChunkingStrategy(ChunkingStrategy):
|
||||||
results.append(("\n".join(parts), page_num))
|
results.append(("\n".join(parts), page_num))
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
class QuestionChunkingStrategy(ChunkingStrategy):
    """Chunk text by detecting Q&A structure using regex patterns and/or an LLM.

    Designed for LegCo documents with explicit 問/答 or Q1/Q2 markers.
    Falls back to a single narrative section when no Q&A structure is found.

    After each ``chunk`` / ``chunk_pages`` call, ``_chunk_metadata`` holds one
    metadata dict per returned chunk, in the same order.
    """

    def __init__(
        self,
        settings: "Settings",
        llm_client: Optional["LLMClient"] = None,
    ):
        """Store the settings and the optional LLM client.

        Args:
            settings: Application settings; ``qa_max_chunk_tokens`` is read
                from it when present (default 3000).
            llm_client: Optional client used by ``chunk_pages`` for structure
                detection when both regex passes find nothing.
        """
        self._settings = settings
        self._llm_client = llm_client
        self._max_tokens = getattr(settings, "qa_max_chunk_tokens", 3000)
        self._chunk_metadata: List[dict] = []

    def chunk(self, text: str) -> List[str]:
        """Split text into chunks using Q&A detection (for DOCX/TXT).

        Only the regex passes are used here; the LLM is not consulted.
        Returns an empty list for empty or whitespace-only input.
        """
        # Reset first so an empty input never leaves a previous call's
        # metadata behind for the caller to pick up.
        self._chunk_metadata = []
        if not text or not text.strip():
            return []

        from app.utils.qa_chunking import (
            Section,
            split_chinese_qa,
            split_english_qa,
        )

        sections = split_chinese_qa(text) or split_english_qa(text)
        if not sections:
            sections = [Section(type="narrative", content=text, start_page=1, end_page=1)]

        return [chunk_text for chunk_text, _ in self._finalize(sections)]

    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
    ) -> List[Tuple[str, int]]:
        """Split page-segmented text using Q&A detection (for PDF).

        Detection order: Chinese regex, English regex, then the LLM (only when
        a client was supplied), then a single narrative section covering the
        whole document.

        Args:
            pages: ``(page_number, page_text)`` tuples in document order.
            overlap_tokens: Not read by this strategy.

        Returns:
            List of ``(chunk_text, page_number)`` where page_number is the
            start page of the section the chunk came from.
        """
        self._chunk_metadata = []
        if not pages:
            return []

        from app.utils.qa_chunking import (
            Section,
            preprocess_text,
            split_chinese_qa,
            split_english_qa,
        )

        full_text = preprocess_text(pages)

        sections = split_chinese_qa(full_text) or split_english_qa(full_text)
        if not sections and self._llm_client is not None:
            sections = self._detect_sections_with_llm(full_text)

        if not sections:
            # Use the real first/last page numbers: the input need not start
            # at page 1 or be contiguous, so len(pages) is not a page number.
            sections = [
                Section(
                    type="narrative",
                    content=full_text,
                    start_page=pages[0][0],
                    end_page=pages[-1][0],
                )
            ]

        return self._finalize(sections)

    def _finalize(self, sections) -> List[Tuple[str, int]]:
        """Build chunks from sections and record their metadata on the instance."""
        from app.utils.qa_chunking import build_chunks_from_sections

        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [(chunk_text, page_num) for chunk_text, page_num, _ in results]

    def _detect_sections_with_llm(self, full_text: str) -> list:
        """Ask the LLM to classify the document; return [] when that is not possible.

        The LLM call is awaited on a fresh event loop via ``asyncio.run``, which
        also works in worker threads that have no current event loop. When this
        thread is already running an event loop a nested run is impossible, so
        detection is skipped and the skip is logged.
        """
        import asyncio

        from app.utils.qa_chunking import (
            build_structure_detection_prompt,
            parse_llm_structure_response,
        )

        prompt = build_structure_detection_prompt(full_text)

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            pass  # No loop running in this thread: safe to start one below.
        else:
            logger.warning(
                "LLM structure detection skipped: called from a running event loop"
            )
            return []

        async def _ask() -> str:
            # Wrapped in a coroutine because asyncio.run only accepts coroutines.
            return await self._llm_client.complete(
                prompt, temperature=0.3, step_name="StructureDetection"
            )

        try:
            return parse_llm_structure_response(asyncio.run(_ask()))
        except Exception:
            # Best effort: any LLM or parsing failure falls back to narrative chunking.
            logger.warning("LLM structure detection failed, using fallback", exc_info=True)
            return []
|
||||||
|
|
||||||
|
|
||||||
|
def get_chunking_strategy(
    name: str,
    settings: "Settings",
    llm_client: Optional["LLMClient"] = None,
) -> ChunkingStrategy:
    """Factory: return the named chunking strategy.

    Args:
        name: "token" or "question". Any other value yields the token strategy.
        settings: Application settings instance.
        llm_client: Optional LLM client forwarded to the question strategy so
            its LLM structure detection can run; ignored for the token strategy.

    Returns:
        ChunkingStrategy instance.
    """
    if name == "question":
        return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)
    return TokenChunkingStrategy(
        chunk_size=settings.chunk_size,
        overlap=settings.chunk_overlap,
    )
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,8 @@ def extract_metadata(
|
||||||
page_numbers: List[int | None] | None = None,
|
page_numbers: List[int | None] | None = None,
|
||||||
chunk_file_paths: List[str | None] | None = None,
|
chunk_file_paths: List[str | None] | None = None,
|
||||||
document_id: str | None = None,
|
document_id: str | None = None,
|
||||||
|
strategy_type: str = "token",
|
||||||
|
chunk_metadata: List[Dict[str, Any]] | None = None,
|
||||||
) -> List[Dict[str, Any]]:
|
) -> List[Dict[str, Any]]:
|
||||||
"""Extract metadata for a list of text chunks.
|
"""Extract metadata for a list of text chunks.
|
||||||
|
|
||||||
|
|
@ -23,6 +25,10 @@ def extract_metadata(
|
||||||
- chunk_file_path: path to the per-chunk source file
|
- chunk_file_path: path to the per-chunk source file
|
||||||
- document_id: unique identifier linking all chunks to the same document
|
- document_id: unique identifier linking all chunks to the same document
|
||||||
|
|
||||||
|
Package 8 Q&A fields (present when chunk_metadata provided):
|
||||||
|
- strategy_type, section_type, question_index, question_id, question_text,
|
||||||
|
section_heading, answer_contains_table, source_page_range, parent_topic
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Path to the file associated with the chunks.
|
file_path: Path to the file associated with the chunks.
|
||||||
chunks: List of string chunks to generate metadata for.
|
chunks: List of string chunks to generate metadata for.
|
||||||
|
|
@ -31,6 +37,12 @@ def extract_metadata(
|
||||||
page_numbers: Optional per-chunk page numbers. Length must match chunks.
|
page_numbers: Optional per-chunk page numbers. Length must match chunks.
|
||||||
chunk_file_paths: Optional per-chunk source file paths. Length must match chunks.
|
chunk_file_paths: Optional per-chunk source file paths. Length must match chunks.
|
||||||
document_id: Optional unique document identifier applied to all chunks.
|
document_id: Optional unique document identifier applied to all chunks.
|
||||||
|
strategy_type: Chunking strategy used ("token" or "question"). Stored in
|
||||||
|
each chunk's metadata.
|
||||||
|
chunk_metadata: Optional per-chunk metadata dicts from Q&A strategy.
|
||||||
|
Each dict is merged into the corresponding base metadata entry.
|
||||||
|
Length must match chunks. Fields like question_id, question_index,
|
||||||
|
section_type, etc. are forwarded to ChromaDB metadata.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A list of metadata dictionaries, one per chunk. Empty list if chunks is empty.
|
A list of metadata dictionaries, one per chunk. Empty list if chunks is empty.
|
||||||
|
|
@ -55,6 +67,11 @@ def extract_metadata(
|
||||||
f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
|
f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if chunk_metadata is not None and len(chunk_metadata) > 0 and len(chunk_metadata) != len(chunks):
|
||||||
|
raise ValueError(
|
||||||
|
f"chunk_metadata length ({len(chunk_metadata)}) does not match chunks length ({len(chunks)})"
|
||||||
|
)
|
||||||
|
|
||||||
filename = original_filename if original_filename else os.path.basename(file_path)
|
filename = original_filename if original_filename else os.path.basename(file_path)
|
||||||
upload_date = datetime.now().isoformat()
|
upload_date = datetime.now().isoformat()
|
||||||
|
|
||||||
|
|
@ -68,6 +85,7 @@ def extract_metadata(
|
||||||
"content_summary": content_summary,
|
"content_summary": content_summary,
|
||||||
"chunk_index": idx,
|
"chunk_index": idx,
|
||||||
"document_id": document_id,
|
"document_id": document_id,
|
||||||
|
"strategy_type": strategy_type,
|
||||||
}
|
}
|
||||||
page_num = page_numbers[idx] if page_numbers else None
|
page_num = page_numbers[idx] if page_numbers else None
|
||||||
if page_num is not None:
|
if page_num is not None:
|
||||||
|
|
@ -75,6 +93,8 @@ def extract_metadata(
|
||||||
cfp = chunk_file_paths[idx] if chunk_file_paths else None
|
cfp = chunk_file_paths[idx] if chunk_file_paths else None
|
||||||
if cfp is not None:
|
if cfp is not None:
|
||||||
entry["chunk_file_path"] = cfp
|
entry["chunk_file_path"] = cfp
|
||||||
|
if chunk_metadata:
|
||||||
|
entry.update(chunk_metadata[idx])
|
||||||
metadata.append(entry)
|
metadata.append(entry)
|
||||||
|
|
||||||
return metadata
|
return metadata
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,361 @@
|
||||||
|
"""Q&A-pair chunking utilities for Package 8.
|
||||||
|
|
||||||
|
Provides section detection (LLM + regex), text preprocessing,
|
||||||
|
and chunk building for LegCo documents with Q&A structure.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Section:
    """One detected section of a document, as produced by the regex or LLM passes."""

    # Section kind: "qa" | "narrative" | "speaking_notes" | "table" | "toc" | "heading_only"
    type: str
    # Section heading text; empty when none was detected.
    heading: str = ""
    # Q&A fields, populated for "qa" sections.
    qa_id: Optional[str] = None
    question: Optional[str] = None
    answer: Optional[str] = None
    # Body text for the non-Q&A section kinds.
    content: str = ""
    # Page range covered by the section.
    start_page: int = 1
    end_page: int = 1
    # Whether the section contains a table.
    has_table: bool = False
    # Topic label of the enclosing section; empty when unknown.
    parent_topic: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
# A footer reference such as "X-1" alone on a line.
_FOOTER_RE = re.compile(r"^[A-Z]-\d+\s*$", re.MULTILINE)
# A footer reference followed by an ISO date (YYYY-MM-DD) on the next line.
_FOOTER_DATE_RE = re.compile(r"^[A-Z]-\d+\s*\n\d{4}-\d{2}-\d{2}$", re.MULTILINE)
# A bare "(A)"-style letter alone on a line.
_HEADER_LETTER_RE = re.compile(r"^(\([A-Z]\))\s*$", re.MULTILINE)
# Wide colon variants, written as escapes so the class survives any tool that
# folds them to ASCII: U+FE30 (vertical two-dot leader) and U+FF1A (fullwidth colon).
_FULLWIDTH_COLON_RE = re.compile("[\ufe30\uff1a]")


def preprocess_text(pages: List[Tuple[int, str]]) -> str:
    """Concatenate pages, strip footers/headers, normalize colons, insert [PAGE_BREAK: N] markers.

    Args:
        pages: ``(page_number, page_text)`` tuples in document order.

    Returns:
        One string in which every page is preceded by a ``[PAGE_BREAK: N]``
        line; pages are joined with a single newline. Empty input gives "".
    """
    parts: List[str] = []
    for idx, (page_num, page_text) in enumerate(pages):
        # Remove the two-line "ref + date" footer before the single-line one,
        # otherwise the date line would be left behind on its own.
        text = _FOOTER_DATE_RE.sub("", page_text)
        text = _FOOTER_RE.sub("", text)
        if idx > 0:
            # Bare letter lines are kept on the first page only.
            text = _HEADER_LETTER_RE.sub("", text)
        text = _FULLWIDTH_COLON_RE.sub(":", text)
        parts.append(f"[PAGE_BREAK: {page_num}]\n{text}")
    return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
_STRUCTURE_PROMPT_TEMPLATE = """You are analyzing a Hong Kong Legislative Council document.
|
||||||
|
The text has page markers like [PAGE_BREAK: N] showing where pages begin.
|
||||||
|
|
||||||
|
For each distinct section in this document, identify:
|
||||||
|
1. The section type:
|
||||||
|
- "qa": a question-and-answer pair (問/答 or Q1/Q2 format)
|
||||||
|
- "narrative": policy text, explanatory paragraphs, section content with bullets
|
||||||
|
- "speaking_notes": briefing points (發言要點) with bullet markers
|
||||||
|
- "table": standalone data tables (not embedded in answers)
|
||||||
|
- "toc": table of contents
|
||||||
|
- "heading_only": a section heading with no following content
|
||||||
|
|
||||||
|
2. For "qa" sections:
|
||||||
|
- The question text (exact)
|
||||||
|
- The answer text (exact, including tables, bullet lists, and [內部參考] content)
|
||||||
|
- The question ID if present (e.g. "A1", "Q3")
|
||||||
|
- The start page and end page
|
||||||
|
|
||||||
|
3. For all sections:
|
||||||
|
- The section heading (e.g. "(A) 排水系統", "(1) 住戶的安置補償")
|
||||||
|
- The start page and end page
|
||||||
|
- Whether the section contains tables
|
||||||
|
|
||||||
|
Return JSON:
|
||||||
|
{{
|
||||||
|
"sections": [
|
||||||
|
{{
|
||||||
|
"type": "qa",
|
||||||
|
"heading": "(A) 排水系統",
|
||||||
|
"qa_id": "A1",
|
||||||
|
"question": "...",
|
||||||
|
"answer": "...",
|
||||||
|
"start_page": 2,
|
||||||
|
"end_page": 3,
|
||||||
|
"has_table": true,
|
||||||
|
"parent_topic": "排水系統"
|
||||||
|
}},
|
||||||
|
{{
|
||||||
|
"type": "narrative",
|
||||||
|
"heading": "(1) 住戶的安置補償",
|
||||||
|
"content": "...",
|
||||||
|
"start_page": 2,
|
||||||
|
"end_page": 5,
|
||||||
|
"has_table": false
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}}
|
||||||
|
|
||||||
|
DOCUMENT TEXT:
|
||||||
|
{document_text}"""
|
||||||
|
|
||||||
|
|
||||||
|
def build_structure_detection_prompt(text: str) -> str:
    """Return the section-classification prompt with *text* embedded as the document body."""
    # The template's only placeholder is {document_text}.
    substitutions = {"document_text": text}
    return _STRUCTURE_PROMPT_TEMPLATE.format(**substitutions)
|
||||||
|
|
||||||
|
|
||||||
|
# Captures the payload of the first ``` or ```json fenced block.
_MARKDOWN_FENCE_RE = re.compile(r"```(?:json)?\s*\n?(.*?)\n?```", re.DOTALL)


def _coerce_page(value, default: int) -> int:
    """Return *value* as a page number >= 1, or *default* when it is unusable."""
    try:
        page = int(value)
    except (TypeError, ValueError):
        return default
    return page if page >= 1 else default


def parse_llm_structure_response(response_text: str) -> List[Section]:
    """Parse the JSON returned by the LLM. Handle markdown code fences.

    Accepts either an object with a ``"sections"`` list or a bare list of
    section objects. Entries that are not JSON objects are skipped with a
    warning. ``null`` text fields become empty strings, page numbers are
    coerced to integers, and ``end_page`` is never smaller than ``start_page``.

    Args:
        response_text: Raw model output, optionally wrapped in a ``` fence.

    Returns:
        The parsed sections, in the order the model listed them.

    Raises:
        ValueError: If the response is not valid JSON, or its top-level value
            is neither an object nor a list.
    """
    cleaned = response_text.strip()
    fence_match = _MARKDOWN_FENCE_RE.search(cleaned)
    if fence_match:
        cleaned = fence_match.group(1).strip()

    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Invalid JSON from LLM structure detection: {exc}") from exc

    if isinstance(data, dict):
        sections_raw = data.get("sections") or []
    else:
        # A bare array is taken to be the sections list itself.
        sections_raw = data
    if not isinstance(sections_raw, list):
        raise ValueError(
            "Invalid JSON from LLM structure detection: expected a list of sections, "
            f"got {type(sections_raw).__name__}"
        )

    sections: List[Section] = []
    for raw in sections_raw:
        if not isinstance(raw, dict):
            logger.warning("Skipping non-object section entry from LLM: %r", raw)
            continue
        start_page = _coerce_page(raw.get("start_page"), 1)
        # A missing or invalid end page falls back to the start page, so the
        # reported range can never run backwards.
        end_page = max(start_page, _coerce_page(raw.get("end_page"), start_page))
        sections.append(Section(
            type=str(raw.get("type") or "narrative"),
            # "or" also maps an explicit JSON null to the empty default.
            heading=raw.get("heading") or "",
            qa_id=raw.get("qa_id"),
            question=raw.get("question"),
            answer=raw.get("answer"),
            content=raw.get("content") or "",
            start_page=start_page,
            end_page=end_page,
            has_table=bool(raw.get("has_table", False)),
            parent_topic=raw.get("parent_topic") or "",
        ))
    return sections
|
||||||
|
|
||||||
|
|
||||||
|
# Matches "[PAGE_BREAK: N]" page markers; group 1 is the page number.
_PAGE_MARKER_RE = re.compile(r"\[PAGE_BREAK:\s*(\d+)\]")

# "問 <ID>: question" followed by "答 <same ID>: answer". The answer runs up to
# the next "問 <ID>:" line or to the end of the text. Colon variants accepted:
# U+FE30, U+FF1A (written as escapes) and the ASCII colon.
_CN_QA_RE = re.compile(
    r"問\s*([A-Z]\d+)\s*[\ufe30\uff1a:]\s*(.*?)\s*"
    r"(?:\n\s*答\s*\1\s*[\ufe30\uff1a:]\s*(.*?)\s*)"
    # \Z (not "\n...$") so the last pair still matches when the text does not
    # end with a newline.
    r"(?=\n\s*問\s*[A-Z]\d+\s*[\ufe30\uff1a:]|\Z)",
    re.DOTALL,
)

# "Q<n> question" on one line, followed by the non-empty lines up to the next
# "Q<n>" line; a blank line also ends the answer.
_EN_QA_RE = re.compile(
    r"^(Q\d+)\s+(.*?)\s*$\n((?:(?!^Q\d+).+(?:\n|$))*)",
    re.MULTILINE,
)


def _page_locator(text: str):
    """Return a function mapping a character offset in *text* to its page number.

    The page of an offset is the number in the last ``[PAGE_BREAK: N]`` marker
    that starts at or before it, or 1 when no marker precedes it.
    """
    from bisect import bisect_right

    offsets: List[int] = []
    numbers: List[int] = []
    for marker in _PAGE_MARKER_RE.finditer(text):
        offsets.append(marker.start())
        numbers.append(int(marker.group(1)))

    def page_at(offset: int) -> int:
        rank = bisect_right(offsets, offset)
        return numbers[rank - 1] if rank else 1

    return page_at


def _page_span(text: str, page_at, start: int, end: int) -> Tuple[int, int]:
    """Return (start_page, end_page) for the matched span text[start:end].

    Trailing whitespace and trailing page markers are ignored when locating
    the end, so a marker that merely follows the answer does not extend it.
    """
    body = text[start:end].rstrip()
    while body.endswith("]"):
        marker_at = body.rfind("[PAGE_BREAK:")
        if marker_at < 0 or _PAGE_MARKER_RE.fullmatch(body, marker_at) is None:
            break
        body = body[:marker_at].rstrip()
    first = page_at(start)
    last = page_at(start + max(len(body) - 1, 0))
    return first, max(first, last)


def split_chinese_qa(text: str) -> List[Section]:
    """Regex fast-pass for 問/答 format. Returns empty list if no matches found.

    Each section's start_page is the page of its 問 marker and end_page the
    page where its answer text ends, taken from ``[PAGE_BREAK: N]`` markers;
    both are 1 for text without markers. Question and answer text are returned
    as matched, with any embedded markers left in place.
    """
    page_at = _page_locator(text)
    sections: List[Section] = []
    for m in _CN_QA_RE.finditer(text):
        start_page, end_page = _page_span(text, page_at, m.start(), m.end())
        sections.append(Section(
            type="qa",
            qa_id=m.group(1),
            question=m.group(2).strip(),
            answer=(m.group(3) or "").strip(),
            start_page=start_page,
            end_page=end_page,
        ))
    return sections


def split_english_qa(text: str) -> List[Section]:
    """Regex fast-pass for Q-number format. Returns empty list if no matches found.

    Page numbers are derived from ``[PAGE_BREAK: N]`` markers in the same way
    as in split_chinese_qa; both are 1 for text without markers.
    """
    page_at = _page_locator(text)
    sections: List[Section] = []
    for m in _EN_QA_RE.finditer(text):
        start_page, end_page = _page_span(text, page_at, m.start(), m.end())
        sections.append(Section(
            type="qa",
            qa_id=m.group(1),
            question=m.group(2).strip(),
            answer=m.group(3).strip(),
            start_page=start_page,
            end_page=end_page,
        ))
    return sections
|
||||||
|
|
||||||
|
|
||||||
|
def _estimate_tokens(text: str) -> int:
    """Rough token estimate: ~1.3 tokens per CJK char, ~1 token per 4 chars otherwise.

    Only U+4E00..U+9FFF counts as CJK; every other character (including CJK
    punctuation) is costed at a quarter of a token.
    """
    cjk_count = sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
    return int(cjk_count * 1.3 + (len(text) - cjk_count) / 4)


# Separators tried in order when a text is too large: paragraphs, then lines.
_SPLIT_SEPARATORS = ("\n\n", "\n")


def _hard_split(body: str, prefix: str, max_tokens: int) -> List[str]:
    """Slice *body* into fixed-width pieces when no separator is left to split on."""
    # +1 covers the fraction that int() truncates from the prefix estimate.
    room = max_tokens - (_estimate_tokens(prefix) + 1)
    if room < 1:
        # The prefix alone uses up the budget; still make progress with
        # slices worth about half the budget instead of one-character chunks.
        room = max(1, max_tokens // 2)
    # 1.3 tokens per character is the estimator's worst case, so a slice of
    # this width fits whenever the prefix left any room at all.
    width = max(1, int(room / 1.3))
    return [body[i:i + width] for i in range(0, len(body), width)]


def _split_to_fit(
    body: str, prefix: str, max_tokens: int, separators: Tuple[str, ...] = _SPLIT_SEPARATORS,
) -> List[str]:
    """Split *body* so that ``prefix + piece`` fits in *max_tokens* for every piece.

    Pieces are grouped greedily at the coarsest separator that works and are
    re-joined with that same separator. A part that is still too large is
    split at the next finer separator, and finally sliced by character count.
    Whitespace-only input yields no pieces.
    """
    if not body.strip():
        return []
    if max_tokens < 1:
        return [body]  # No meaningful limit to honour.

    def fits(piece: str) -> bool:
        return _estimate_tokens(prefix + piece) <= max_tokens

    if fits(body):
        return [body]
    if not separators:
        return _hard_split(body, prefix, max_tokens)

    sep, finer = separators[0], separators[1:]
    parts = body.split(sep)
    if len(parts) == 1:
        return _split_to_fit(body, prefix, max_tokens, finer)

    pieces: List[str] = []
    current = ""
    for part in parts:
        candidate = f"{current}{sep}{part}" if current else part
        if fits(candidate):
            current = candidate
            continue
        if current:
            pieces.append(current)
        if fits(part):
            current = part
        else:
            # This part alone is too large: go one separator level finer.
            pieces.extend(_split_to_fit(part, prefix, max_tokens, finer))
            current = ""
    if current:
        pieces.append(current)
    return pieces


def _split_oversized_qa(
    question: str, answer: str, page: int, heading: str,
    qa_id: Optional[str], question_index: int, has_table: bool,
    parent_topic: str, start_page: int, end_page: int,
    max_tokens: int,
) -> List[Tuple[str, int, dict]]:
    """Recursively split an oversized Q&A answer with question prepended to each sub-chunk.

    The answer is split at paragraph breaks, then line breaks, then by
    character count, until each sub-chunk fits the estimated token budget.

    Returns:
        ``(chunk_text, page, metadata)`` tuples, one per sub-chunk; all carry
        the same page and the same metadata values. At least one tuple is
        returned, even when the answer is empty.
    """
    # The real "part i/total" label is not known until the split is done, so
    # the budget is computed against this placeholder of similar length.
    budget_prefix = f"Question: {question}\n\nAnswer (part 1/N): "
    sub_chunks = _split_to_fit(answer, budget_prefix, max_tokens) or [answer]

    total = len(sub_chunks)
    results: List[Tuple[str, int, dict]] = []
    for part_no, sub in enumerate(sub_chunks, start=1):
        chunk_text = f"Question: {question}\n\nAnswer (part {part_no}/{total}): {sub}"
        meta = {
            "strategy_type": "question",
            "section_type": "qa",
            "question_index": question_index,
            "question_id": qa_id,
            "question_text": question,
            "section_heading": heading,
            "answer_contains_table": has_table,
            "source_page_range": [start_page, end_page],
            "parent_topic": parent_topic,
        }
        results.append((chunk_text, page, meta))
    return results


def build_chunks_from_sections(
    sections: List[Section], max_tokens: int = 3000,
) -> List[Tuple[str, int, dict]]:
    """Build chunk texts + page refs + metadata from sections.

    Per section type:
      - "toc" / "heading_only": skipped.
      - "qa": one chunk "Question: ... Answer: ..." (heading prepended in
        brackets), split into numbered parts when it exceeds *max_tokens*.
      - "narrative": heading prefix + content, split to fit *max_tokens*.
      - "speaking_notes": one chunk per "⚫" bullet.
      - "table": one chunk with the table content.
      - any other type: produces no chunk.

    Args:
        sections: Detected sections in document order.
        max_tokens: Estimated-token budget per Q&A / narrative chunk.

    Returns List[(chunk_text, page_number, metadata_dict)]; page_number is the
    section's start_page.
    """
    chunks: List[Tuple[str, int, dict]] = []
    qa_index = 0

    for section in sections:
        if section.type in ("toc", "heading_only"):
            continue

        # Normalise optional text fields so a None never reaches str methods.
        heading = section.heading or ""
        page_range = [section.start_page, section.end_page]

        if section.type == "qa":
            question_text = section.question or ""
            answer_text = section.answer or ""
            chunk_text = f"Question: {question_text}\n\nAnswer: {answer_text}"
            if heading:
                chunk_text = f"[{heading}]\n{chunk_text}"

            if _estimate_tokens(chunk_text) > max_tokens:
                chunks.extend(_split_oversized_qa(
                    question=question_text,
                    answer=answer_text,
                    page=section.start_page,
                    heading=heading,
                    qa_id=section.qa_id,
                    question_index=qa_index,
                    has_table=section.has_table,
                    parent_topic=section.parent_topic,
                    start_page=section.start_page,
                    end_page=section.end_page,
                    max_tokens=max_tokens,
                ))
            else:
                meta: Dict = {
                    "strategy_type": "question",
                    "section_type": "qa",
                    "question_index": qa_index,
                    "question_id": section.qa_id,
                    "question_text": question_text,
                    "section_heading": heading,
                    "answer_contains_table": section.has_table,
                    "source_page_range": page_range,
                    "parent_topic": section.parent_topic,
                }
                chunks.append((chunk_text, section.start_page, meta))

            # Counts Q&A sections, not chunks: split parts share one index.
            qa_index += 1

        elif section.type == "narrative":
            content = section.content or ""
            if not content.strip():
                continue
            prefix = f"[{heading}]\n" if heading else ""
            meta = {
                "strategy_type": "question",
                "section_type": "narrative",
                "section_heading": heading,
                "source_page_range": page_range,
            }
            # Content that already fits comes back as a single piece.
            for piece in _split_to_fit(content, prefix, max_tokens):
                chunks.append((f"{prefix}{piece}", section.start_page, dict(meta)))

        elif section.type == "speaking_notes":
            content = section.content or ""
            if not content.strip():
                continue
            # Zero-width split: each piece starts at a "⚫" bullet marker.
            bullets = [b.strip() for b in re.split(r"(?=⚫)", content) if b.strip()]
            if not bullets:
                bullets = [content]
            prefix = f"[{heading}]\n" if heading else ""
            for bullet in bullets:
                meta = {
                    "strategy_type": "question",
                    "section_type": "speaking_notes",
                    "section_heading": heading,
                    "source_page_range": page_range,
                }
                chunks.append((f"{prefix}{bullet}", section.start_page, meta))

        elif section.type == "table":
            content = section.content or ""
            if not content.strip():
                continue
            chunk_text = f"[{heading}]\n{content}" if heading else content
            meta = {
                "strategy_type": "question",
                "section_type": "table",
                "section_heading": heading,
                "answer_contains_table": True,
                "source_page_range": page_range,
            }
            chunks.append((chunk_text, section.start_page, meta))

    return chunks
|
||||||
|
|
@ -0,0 +1,147 @@
|
||||||
|
"""Table extraction utilities for Package 8.
|
||||||
|
|
||||||
|
Provides vision-based and text-based table detection and markdown conversion
|
||||||
|
for LegCo documents. Uses the existing LLM model (vision-capable) for
|
||||||
|
table-to-markdown conversion.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Disk cache location for vision-LLM table output: the ".cache/vision_tables"
# folder under the directory three levels above this file.
_CACHE_DIR = Path(__file__).resolve().parent.parent.parent.joinpath(".cache", "vision_tables")
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_tables_vision(page_images: List[str], llm_client) -> List[str]:
    """Ask the vision-capable LLM to transcribe each page image as Markdown.

    Args:
        page_images: Base64-encoded PNG pages; each one is embedded in the
            request as a ``data:image/png;base64,...`` URL.
        llm_client: Client object exposing ``model`` and ``_client``; the
            ``_client.chat.completions.create`` coroutine is awaited once
            per page.

    Returns:
        The stripped, non-empty Markdown replies in page order. A page whose
        request raised, or whose reply was blank, contributes nothing, so the
        result can be shorter than ``page_images``.
    """
    instructions = (
        "Convert this page to Markdown. For any tables:\n"
        "- Use proper markdown table syntax with |---|---| alignment\n"
        "- Preserve all column headers and row labels\n"
        "- Do not modify or translate the content\n"
        "- If a table spans multiple pages, note it"
    )
    markdown_pages: List[str] = []

    for page_no, encoded_png in enumerate(page_images):
        text_part = {"type": "text", "text": instructions}
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{encoded_png}"},
        }
        request = [{"role": "user", "content": [text_part, image_part]}]

        try:
            reply = await llm_client._client.chat.completions.create(
                model=llm_client.model,
                messages=request,
                temperature=0.1,
            )
            page_md = (reply.choices[0].message.content or "").strip()
            if page_md:
                markdown_pages.append(page_md)
        except Exception:
            # Best effort: one failing page is logged and skipped so the
            # remaining pages are still processed.
            logger.warning("Vision table extraction failed for page image %d", page_no, exc_info=True)

    return markdown_pages
|
||||||
|
|
||||||
|
|
||||||
|
# Per-line heuristics that mark a line as "table-like":
#   1. a pipe, a run of whitespace/dashes/colons, then a pipe (markdown separator row)
#   2. an ASCII border such as +----+ or +====+
#   3. three or more tokens each followed by 2+ whitespace characters, ending
#      at the line terminator (whitespace-aligned columns)
_TABLE_HEURISTIC_RE = [
    r"(?:\|[\s\-:]+\|)",
    r"(?:\+[-=]+\+)",
    r"(?:(?:\S+\s{2,}){3,}\n)",
]

# Prompt template for converting one detected raw-text region to markdown.
_TABLE_REGION_PROMPT = (
    "Convert this raw table text extracted from a PDF into a markdown table.\n"
    "Preserve all data exactly. Detect column boundaries and alignment.\n\n"
    "{table_text}"
)


async def extract_tables_text(text: str, llm_client) -> List[str]:
    """Detect table-like text regions and ask the LLM to convert each to markdown.

    A region starts at the first line matching any ``_TABLE_HEURISTIC_RE``
    pattern and continues over following non-blank lines; a blank line ends
    it. Regions shorter than three lines are discarded.

    Args:
        text: Raw text to scan, with lines separated by ``\\n``.
        llm_client: Object whose ``complete(prompt, temperature=...,
            step_name=...)`` coroutine returns the model reply as a string.

    Returns:
        The stripped, non-empty LLM replies, one per region that converted
        successfully, in document order. Empty when no region is detected.
    """
    import re

    # Compile once per call instead of looking each pattern up for every line.
    patterns = [re.compile(pat) for pat in _TABLE_HEURISTIC_RE]

    regions: List[str] = []
    current_region: List[str] = []
    in_table = False

    for line in text.split("\n"):
        # split("\n") removes the terminator, but the whitespace-column
        # heuristic ends in "\n"; match against the line with its newline
        # restored so that pattern is not silently dead.
        probe = line + "\n"
        is_table_line = any(pat.search(probe) for pat in patterns)

        if is_table_line or (in_table and line.strip()):
            in_table = True
            current_region.append(line)
            continue

        # Blank line (or non-table line outside a region): close the region.
        if len(current_region) >= 3:
            regions.append("\n".join(current_region))
        current_region = []
        in_table = False

    # Flush a region that runs to the end of the text.
    if len(current_region) >= 3:
        regions.append("\n".join(current_region))

    if not regions:
        return []

    results: List[str] = []
    for region in regions:
        prompt = _TABLE_REGION_PROMPT.format(table_text=region)
        try:
            response = await llm_client.complete(prompt, temperature=0.1, step_name="TableExtraction")
            if response.strip():
                results.append(response.strip())
        except Exception:
            # Best effort: a failed conversion is logged and the remaining
            # regions are still attempted.
            logger.warning("Text-based table extraction failed", exc_info=True)
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def inject_tables_into_answer(answer: str, tables_md: List[str]) -> str:
    """Splice markdown tables into an answer at the position of their header row.

    For each table, the first non-blank line (stripped) is used as an anchor:
    every occurrence of that text in the answer is replaced by the full
    markdown table. Tables whose anchor does not occur in the answer, or that
    contain no non-blank line at all, are ignored.

    Args:
        answer: The answer text that may contain raw table header lines.
        tables_md: Markdown tables to inject.

    Returns:
        The answer with matching header lines replaced by their tables; the
        original answer when ``tables_md`` is empty or nothing matches.
    """
    if not tables_md:
        return answer

    result = answer
    for table_md in tables_md:
        # Anchor on the first non-blank line. An empty anchor must never be
        # used: "" is a substring of every string and str.replace("", x)
        # would insert the table between every character of the answer.
        header = next(
            (line.strip() for line in table_md.split("\n") if line.strip()),
            "",
        )
        if not header:
            continue
        if header in result:
            result = result.replace(header, table_md)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def cache_vision_result(page_hash: str) -> Optional[str]:
    """Look up a cached vision result on disk.

    Despite the name, this function only reads: it returns the markdown
    stored at ``_CACHE_DIR / "<page_hash>.md"``.

    Args:
        page_hash: Cache key used as the file stem.

    Returns:
        The cached markdown text, or ``None`` on a cache miss.
    """
    cache_file = _CACHE_DIR / f"{page_hash}.md"
    try:
        # Read directly instead of exists()-then-read: the file may vanish
        # between the check and the read, which should count as a miss.
        return cache_file.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def save_vision_result(page_hash: str, markdown: str) -> None:
    """Persist a vision result as ``<page_hash>.md`` inside the cache directory.

    The cache directory (and any missing parents) is created on demand.
    """
    _CACHE_DIR.mkdir(parents=True, exist_ok=True)
    target = _CACHE_DIR / f"{page_hash}.md"
    target.write_text(markdown, encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def compute_page_hash(page_image_b64: str) -> str:
    """Derive the cache key for a page image.

    Returns the first 16 hexadecimal characters of the SHA-256 digest of the
    UTF-8 encoded base64 string.
    """
    digest = hashlib.sha256()
    digest.update(page_image_b64.encode("utf-8"))
    return digest.hexdigest()[:16]
|
||||||
|
|
@ -56,9 +56,32 @@ export const ChunkList: React.FC<ChunkListProps> = ({
|
||||||
<span className="text-xs font-medium text-gray-500 uppercase">
|
<span className="text-xs font-medium text-gray-500 uppercase">
|
||||||
Chunk {chunk.chunk_index}
|
Chunk {chunk.chunk_index}
|
||||||
</span>
|
</span>
|
||||||
<span className="text-xs text-gray-400">
|
{chunk.strategy_type === 'question' && chunk.question_id ? (
|
||||||
Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
|
<>
|
||||||
</span>
|
<span className="text-xs text-gray-600">
|
||||||
|
Q: {chunk.question_id}{chunk.question_text ? ` — ${chunk.question_text.length > 80 ? `${chunk.question_text.slice(0, 80)}...` : chunk.question_text}` : ''}
|
||||||
|
</span>
|
||||||
|
{chunk.topic_section && (
|
||||||
|
<span className="text-xs text-gray-500">
|
||||||
|
Topic: {chunk.topic_section}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
{chunk.source_page_range && chunk.source_page_range.length === 2 && (
|
||||||
|
<span className="text-xs text-gray-400">
|
||||||
|
Pages {chunk.source_page_range[0]}-{chunk.source_page_range[1]}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
{chunk.has_table && (
|
||||||
|
<span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-amber-100 text-amber-700">
|
||||||
|
Contains table
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</>
|
||||||
|
) : (
|
||||||
|
<span className="text-xs text-gray-400">
|
||||||
|
Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
<div className="text-sm text-gray-700 truncate" title={chunk.content_summary}>
|
<div className="text-sm text-gray-700 truncate" title={chunk.content_summary}>
|
||||||
{chunk.content_summary.length > 100
|
{chunk.content_summary.length > 100
|
||||||
|
|
@ -67,7 +90,12 @@ export const ChunkList: React.FC<ChunkListProps> = ({
|
||||||
</div>
|
</div>
|
||||||
{chunk.chunk_file_path && (
|
{chunk.chunk_file_path && (
|
||||||
<a
|
<a
|
||||||
href={getPdfViewerUrl(chunk.chunk_file_path, chunk.page_number ?? undefined)}
|
href={getPdfViewerUrl(
|
||||||
|
chunk.chunk_file_path,
|
||||||
|
chunk.strategy_type === 'question' && chunk.source_page_range && chunk.source_page_range.length > 0
|
||||||
|
? chunk.source_page_range[0]
|
||||||
|
: chunk.page_number ?? undefined
|
||||||
|
)}
|
||||||
target="_blank"
|
target="_blank"
|
||||||
rel="noopener noreferrer"
|
rel="noopener noreferrer"
|
||||||
className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline"
|
className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline"
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,18 @@ export const DocumentList: React.FC<DocumentListProps> = ({
|
||||||
<div className="flex items-center space-x-3 flex-1">
|
<div className="flex items-center space-x-3 flex-1">
|
||||||
<FileText className="w-5 h-5 text-gray-500 flex-shrink-0" />
|
<FileText className="w-5 h-5 text-gray-500 flex-shrink-0" />
|
||||||
<div className="flex-1 min-w-0">
|
<div className="flex-1 min-w-0">
|
||||||
<div className="font-medium text-gray-900 truncate">{doc.filename}</div>
|
<div className="flex items-center space-x-2">
|
||||||
|
<span className="font-medium text-gray-900 truncate">{doc.filename}</span>
|
||||||
|
{doc.chunking_strategy === 'question' ? (
|
||||||
|
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-blue-100 text-blue-700">
|
||||||
|
chunked by question
|
||||||
|
</span>
|
||||||
|
) : (
|
||||||
|
<span className="inline-flex items-center px-2 py-0.5 rounded text-xs font-medium bg-gray-100 text-gray-700">
|
||||||
|
chunked by token
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
<div className="text-sm text-gray-500">
|
<div className="text-sm text-gray-500">
|
||||||
{doc.chunk_count} chunks • Uploaded {doc.upload_date}
|
{doc.chunk_count} chunks • Uploaded {doc.upload_date}
|
||||||
</div>
|
</div>
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
import axios from 'axios'
|
import axios from 'axios'
|
||||||
import type { QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'
|
import type { ChunkingStrategy, QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types'
|
||||||
|
|
||||||
const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 'http://localhost:8000/api/v1'
|
const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 'http://localhost:8000/api/v1'
|
||||||
|
|
||||||
|
|
@ -48,10 +48,10 @@ export const queryDocumentStream = async (
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export const ingestDocument = async (file: File): Promise<IngestResponse> => {
|
export const ingestDocument = async (file: File, strategy: ChunkingStrategy = 'token'): Promise<IngestResponse> => {
|
||||||
const form = new FormData()
|
const form = new FormData()
|
||||||
form.append('file', file)
|
form.append('file', file)
|
||||||
const resp = await apiClient.post<IngestResponse>('/ingest', form, {
|
const resp = await apiClient.post<IngestResponse>(`/ingest?strategy=${strategy}`, form, {
|
||||||
headers: { 'Content-Type': 'multipart/form-data' },
|
headers: { 'Content-Type': 'multipart/form-data' },
|
||||||
})
|
})
|
||||||
return resp.data
|
return resp.data
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import React from 'react'
|
import React from 'react'
|
||||||
import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
|
import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query'
|
||||||
import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api'
|
import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api'
|
||||||
import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
|
import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, ChunkingStrategy, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types'
|
||||||
import { useState, useCallback, useRef } from 'react'
|
import { useState, useCallback, useRef } from 'react'
|
||||||
|
|
||||||
export const queryClient = new QueryClient()
|
export const queryClient = new QueryClient()
|
||||||
|
|
@ -185,8 +185,8 @@ export const useQueryDocumentStream = () => {
|
||||||
}
|
}
|
||||||
|
|
||||||
export const useIngestDocument = () => {
|
export const useIngestDocument = () => {
|
||||||
return useMutation<IngestResponse, Error, File>({
|
return useMutation<IngestResponse, Error, { file: File; strategy: ChunkingStrategy }>({
|
||||||
mutationFn: ingestDocument,
|
mutationFn: ({ file, strategy }) => ingestDocument(file, strategy),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,11 @@
|
||||||
import React, { useState, useCallback, useMemo } from 'react'
|
import React, { useState, useCallback, useMemo } from 'react'
|
||||||
import { Database, AlertCircle, CheckCircle, XCircle, Loader2 } from 'lucide-react'
|
import { Database, AlertCircle, CheckCircle, XCircle, Loader2, Type, MessageSquare } from 'lucide-react'
|
||||||
import { useQueryClient } from '@tanstack/react-query'
|
import { useQueryClient } from '@tanstack/react-query'
|
||||||
import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries'
|
import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries'
|
||||||
import { DocumentList } from '../components/DocumentList'
|
import { DocumentList } from '../components/DocumentList'
|
||||||
import { ChunkList } from '../components/ChunkList'
|
import { ChunkList } from '../components/ChunkList'
|
||||||
import { DocumentUpload } from '../components/DocumentUpload'
|
import { DocumentUpload } from '../components/DocumentUpload'
|
||||||
|
import type { ChunkingStrategy } from '../types'
|
||||||
|
|
||||||
interface FileUploadEntry {
|
interface FileUploadEntry {
|
||||||
name: string
|
name: string
|
||||||
|
|
@ -22,6 +23,7 @@ export const RAGDatabasePage: React.FC = () => {
|
||||||
const initialDocId = useMemo(() => getDocumentIdFromUrl(), [])
|
const initialDocId = useMemo(() => getDocumentIdFromUrl(), [])
|
||||||
const [expandedId, setExpandedId] = useState<string | null>(initialDocId)
|
const [expandedId, setExpandedId] = useState<string | null>(initialDocId)
|
||||||
const [uploadEntries, setUploadEntries] = useState<FileUploadEntry[]>([])
|
const [uploadEntries, setUploadEntries] = useState<FileUploadEntry[]>([])
|
||||||
|
const [chunkingStrategy, setChunkingStrategy] = useState<ChunkingStrategy>('token')
|
||||||
|
|
||||||
const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments()
|
const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments()
|
||||||
const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId)
|
const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId)
|
||||||
|
|
@ -60,7 +62,7 @@ export const RAGDatabasePage: React.FC = () => {
|
||||||
const results = await Promise.allSettled(
|
const results = await Promise.allSettled(
|
||||||
files.map(async (file) => {
|
files.map(async (file) => {
|
||||||
try {
|
try {
|
||||||
await ingestDocumentMutation.mutateAsync(file)
|
await ingestDocumentMutation.mutateAsync({ file, strategy: chunkingStrategy })
|
||||||
setUploadEntries((prev) =>
|
setUploadEntries((prev) =>
|
||||||
prev.map((e) =>
|
prev.map((e) =>
|
||||||
e.name === file.name ? { ...e, status: 'success' as const } : e
|
e.name === file.name ? { ...e, status: 'success' as const } : e
|
||||||
|
|
@ -80,7 +82,7 @@ export const RAGDatabasePage: React.FC = () => {
|
||||||
|
|
||||||
queryClient.invalidateQueries({ queryKey: ['documents'] })
|
queryClient.invalidateQueries({ queryKey: ['documents'] })
|
||||||
setTimeout(() => setUploadEntries([]), 5000)
|
setTimeout(() => setUploadEntries([]), 5000)
|
||||||
}, [ingestDocumentMutation, queryClient])
|
}, [ingestDocumentMutation, queryClient, chunkingStrategy])
|
||||||
|
|
||||||
const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length
|
const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length
|
||||||
const successCount = uploadEntries.filter((e) => e.status === 'success').length
|
const successCount = uploadEntries.filter((e) => e.status === 'success').length
|
||||||
|
|
@ -127,6 +129,42 @@ export const RAGDatabasePage: React.FC = () => {
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div className="mt-3 flex items-center space-x-4">
|
||||||
|
<span className="text-sm font-medium text-gray-700">Chunking strategy:</span>
|
||||||
|
<div className="flex items-center space-x-3">
|
||||||
|
<label className="flex items-center space-x-2 cursor-pointer">
|
||||||
|
<input
|
||||||
|
type="radio"
|
||||||
|
name="chunking-strategy"
|
||||||
|
value="token"
|
||||||
|
checked={chunkingStrategy === 'token'}
|
||||||
|
onChange={() => setChunkingStrategy('token')}
|
||||||
|
className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
|
||||||
|
/>
|
||||||
|
<Type className="w-4 h-4 text-gray-500" />
|
||||||
|
<div>
|
||||||
|
<span className="text-sm font-medium text-gray-900">Chunk by Token</span>
|
||||||
|
<span className="text-xs text-gray-500 ml-1">Split by token windows with overlap</span>
|
||||||
|
</div>
|
||||||
|
</label>
|
||||||
|
<label className="flex items-center space-x-2 cursor-pointer">
|
||||||
|
<input
|
||||||
|
type="radio"
|
||||||
|
name="chunking-strategy"
|
||||||
|
value="question"
|
||||||
|
checked={chunkingStrategy === 'question'}
|
||||||
|
onChange={() => setChunkingStrategy('question')}
|
||||||
|
className="h-4 w-4 text-blue-600 border-gray-300 focus:ring-blue-500"
|
||||||
|
/>
|
||||||
|
<MessageSquare className="w-4 h-4 text-gray-500" />
|
||||||
|
<div>
|
||||||
|
<span className="text-sm font-medium text-gray-900">Chunk by Question</span>
|
||||||
|
<span className="text-xs text-gray-500 ml-1">Detect Q&A pairs, extract tables</span>
|
||||||
|
</div>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
{hasEntries && (
|
{hasEntries && (
|
||||||
<div className="mt-4 space-y-2">
|
<div className="mt-4 space-y-2">
|
||||||
<div className="text-sm font-medium text-gray-600">
|
<div className="text-sm font-medium text-gray-600">
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue