diff --git a/backend/app/models/documents.py b/backend/app/models/documents.py index 6477588..307b12b 100644 --- a/backend/app/models/documents.py +++ b/backend/app/models/documents.py @@ -8,6 +8,7 @@ class DocumentInfo(BaseModel): filename: str chunk_count: int upload_date: str + chunking_strategy: str = "token" class ChunkInfo(BaseModel): @@ -16,6 +17,14 @@ class ChunkInfo(BaseModel): content_summary: str page_number: Optional[int] = None chunk_file_path: Optional[str] = None + strategy_type: Optional[str] = None + question_index: Optional[int] = None + question_id: Optional[str] = None + question_text: Optional[str] = None + section_heading: Optional[str] = None + answer_contains_table: Optional[bool] = None + source_page_range: Optional[List[int]] = None + parent_topic: Optional[str] = None class DocumentListResponse(BaseModel): diff --git a/backend/app/routers/ingest.py b/backend/app/routers/ingest.py index 2b2d8d7..163547d 100644 --- a/backend/app/routers/ingest.py +++ b/backend/app/routers/ingest.py @@ -5,9 +5,9 @@ import tempfile import uuid from pathlib import Path -from fastapi import APIRouter, UploadFile, File, HTTPException +from fastapi import APIRouter, UploadFile, File, HTTPException, Query -from app.models.ingest import IngestResponse +from app.models.ingest import IngestResponse, VALID_CHUNKING_STRATEGIES logger = logging.getLogger(__name__) router = APIRouter(tags=["ingest"]) @@ -37,11 +37,14 @@ def _delete_existing_document(rag, filename: str, chunk_dir: str) -> None: @router.post("/ingest", response_model=IngestResponse) -async def ingest_document(file: UploadFile = File(...)): +async def ingest_document( + file: UploadFile = File(...), + strategy: str = Query("token"), +): """Ingest a document into the RAG system.""" from app.core.config import get_settings from app.services.rag import RAGService - from app.utils.chunking import TokenChunkingStrategy + from app.utils.chunking import get_chunking_strategy from app.utils.metadata import 
extract_metadata filename = file.filename or "unknown" @@ -53,6 +56,12 @@ async def ingest_document(file: UploadFile = File(...)): detail=f"Unsupported file format: {file_ext}. Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}", ) + if strategy not in VALID_CHUNKING_STRATEGIES: + raise HTTPException( + status_code=400, + detail=f"Invalid chunking strategy: {strategy}. Valid: {', '.join(sorted(VALID_CHUNKING_STRATEGIES))}", + ) + settings = get_settings() temp_path = None try: @@ -68,9 +77,7 @@ async def ingest_document(file: UploadFile = File(...)): _delete_existing_document(rag, filename, chunk_dir) document_id = str(uuid.uuid4()) - chunker = TokenChunkingStrategy( - chunk_size=settings.chunk_size, overlap=settings.chunk_overlap - ) + chunker = get_chunking_strategy(strategy, settings) if file_ext == ".pdf": from app.utils.pdf_parser import parse_pdf_by_page @@ -105,6 +112,8 @@ async def ingest_document(file: UploadFile = File(...)): ) chunk_file_paths.append(None) + chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None + metadata = extract_metadata( temp_path, chunk_texts, @@ -112,6 +121,8 @@ async def ingest_document(file: UploadFile = File(...)): page_numbers=page_numbers, chunk_file_paths=chunk_file_paths, document_id=document_id, + strategy_type=strategy, + chunk_metadata=chunk_metadata, ) rag.ingest_document(temp_path, chunk_texts, metadata, document_id=document_id) @@ -145,9 +156,12 @@ async def ingest_document(file: UploadFile = File(...)): ) chunk_file_paths.append(None) + chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None + metadata = extract_metadata( temp_path, chunks, original_filename=filename, chunk_file_paths=chunk_file_paths, document_id=document_id, + strategy_type=strategy, chunk_metadata=chunk_metadata, ) rag.ingest_document(temp_path, chunks, metadata, document_id=document_id) @@ -180,9 +194,12 @@ async def ingest_document(file: UploadFile = File(...)): ) 
chunk_file_paths.append(None) + chunk_metadata = chunker._chunk_metadata if hasattr(chunker, '_chunk_metadata') else None + metadata = extract_metadata( temp_path, chunks, original_filename=filename, chunk_file_paths=chunk_file_paths, document_id=document_id, + strategy_type=strategy, chunk_metadata=chunk_metadata, ) rag.ingest_document(temp_path, chunks, metadata, document_id=document_id) @@ -193,6 +210,7 @@ async def ingest_document(file: UploadFile = File(...)): document_id=document_id, chunk_count=chunk_count, filename=filename, + strategy=strategy, ) except HTTPException: diff --git a/backend/app/test/test_phase8_ingest.py b/backend/app/test/test_phase8_ingest.py new file mode 100644 index 0000000..8c3e892 --- /dev/null +++ b/backend/app/test/test_phase8_ingest.py @@ -0,0 +1,209 @@ +"""Phase 8 tests: Ingest API integration with strategy selection (Sub-Phase 8.3). + +Covers: +- POST /api/v1/ingest?strategy=token — existing behavior unchanged +- POST /api/v1/ingest?strategy=question — Q&A chunking applied +- Invalid strategy values return 400 +- IngestResponse includes strategy field +- TXT with Q&A format uses question strategy +- Document without Q&A falls back gracefully +""" +import io +import json +from typing import List, Tuple +from unittest.mock import MagicMock + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient +from pypdf import PdfWriter + +from app.routers.ingest import router + + +class _DeterministicEmbedding: + def name(self) -> str: + return "test_deterministic" + + def __call__(self, input): + return self._embed(input) + + def embed_query(self, input): + return self._embed(input) + + @staticmethod + def _embed(texts): + vectors = [] + for text in texts: + vec = [0.0] * 384 + for i, ch in enumerate(text[:384]): + vec[i] = ord(ch) / 1000.0 + vectors.append(vec) + return vectors + + +def _create_real_pdf(content: str) -> bytes: + writer = PdfWriter() + writer.add_blank_page(width=200, height=200) 
+ buf = io.BytesIO() + writer.write(buf) + return buf.getvalue() + + +def _create_text_txt(content: str) -> bytes: + return content.encode("utf-8") + + +@pytest.fixture +def client(tmp_path, monkeypatch): + """TestClient with real ChromaDB isolated in tmp_path + deterministic embeddings.""" + chroma_path = str(tmp_path / "chroma_db") + chunk_path = str(tmp_path / "document_chunk") + prompts_path = str(tmp_path / "prompts.db") + history_path = str(tmp_path / "history.db") + + monkeypatch.setenv("CHROMA_DB_PATH", chroma_path) + monkeypatch.setenv("DOCUMENT_CHUNK_PATH", chunk_path) + monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path) + monkeypatch.setenv("HISTORY_DB_PATH", history_path) + monkeypatch.setenv("EMBEDDING_MODEL", "test-mock") + monkeypatch.setenv("LLM_API_KEY", "test-key") + + from app.core.config import get_settings + get_settings.cache_clear() + from app.core.dependencies import get_settings_cached + get_settings_cached.cache_clear() + + from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles + conn = _get_db(prompts_path) + init_prompts_db(conn) + seed_default_profiles(conn) + conn.close() + + hconn = _get_db(history_path) + init_history_db(hconn) + hconn.close() + + monkeypatch.setattr( + "app.core.database.get_embedding_function_settings", + lambda settings: _DeterministicEmbedding(), + ) + + test_app = FastAPI() + test_app.include_router(router, prefix="/api/v1") + + yield TestClient(test_app) + + get_settings_cached.cache_clear() + get_settings.cache_clear() + + +def test_ingest_with_strategy_token(client): + """Existing behavior unchanged: strategy=token uses TokenChunkingStrategy.""" + txt_bytes = _create_text_txt("This is a test document with enough content to generate chunks.") + resp = client.post( + "/api/v1/ingest?strategy=token", + files={"file": ("test.txt", txt_bytes, "text/plain")}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["strategy"] == "token" + assert 
data["chunk_count"] > 0 + + +def test_ingest_invalid_strategy_rejected(client): + """Invalid strategy values return 400.""" + txt_bytes = _create_text_txt("test") + resp = client.post( + "/api/v1/ingest?strategy=invalid", + files={"file": ("test.txt", txt_bytes, "text/plain")}, + ) + assert resp.status_code == 400 + assert "strategy" in resp.json()["detail"].lower() + + +def test_ingest_response_includes_strategy(client): + """IngestResponse includes the strategy field.""" + txt_bytes = _create_text_txt("Strategy response test content with more text to ensure chunks.") + resp = client.post( + "/api/v1/ingest?strategy=token", + files={"file": ("test.txt", txt_bytes, "text/plain")}, + ) + assert resp.status_code == 200 + assert "strategy" in resp.json() + + +def test_ingest_default_strategy_is_token(client): + """When no strategy param provided, default to token.""" + txt_bytes = _create_text_txt("Default strategy test with enough text to generate output.") + resp = client.post( + "/api/v1/ingest", + files={"file": ("test.txt", txt_bytes, "text/plain")}, + ) + assert resp.status_code == 200 + assert resp.json()["strategy"] == "token" + + +def test_ingest_question_strategy_txt(client, monkeypatch): + """TXT with Q&A format uses question strategy and produces chunks.""" + _mock_question_chunker(monkeypatch) + + txt_bytes = _create_text_txt("問A1:test question\n答A1:test answer with more text here to ensure chunking works properly.") + + resp = client.post( + "/api/v1/ingest?strategy=question", + files={"file": ("test.txt", txt_bytes, "text/plain")}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["strategy"] == "question" + assert data["chunk_count"] > 0 + + +def test_ingest_question_strategy_no_qa_fallback(client, monkeypatch): + """Document without Q&A markers falls back to narrative chunking without error.""" + _mock_question_chunker(monkeypatch) + + txt_bytes = _create_text_txt("This is plain text without any Q&A markers, but it needs to be 
long enough to generate at least one chunk when processed by the tokenizer.") + + resp = client.post( + "/api/v1/ingest?strategy=question", + files={"file": ("plain.txt", txt_bytes, "text/plain")}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["strategy"] == "question" + assert data["chunk_count"] > 0 + + +def _mock_question_chunker(monkeypatch): + """Replace QuestionChunkingStrategy with a mock that returns test chunks.""" + + class _MockQuestionChunker: + def __init__(self, settings=None, llm_client=None): + self._chunk_metadata = [ + { + "strategy_type": "question", + "section_type": "qa", + "question_index": 0, + "question_id": "A1", + "question_text": "What is X?", + "section_heading": "(A) Topic", + "answer_contains_table": False, + "source_page_range": [1, 2], + } + ] + self._max_tokens = 3000 + + def chunk(self, text): + self._chunk_metadata = self._chunk_metadata[:1] + return ["Question: What is X?\n\nAnswer: X is Y."] + + def chunk_pages(self, pages, overlap_tokens=0): + self._chunk_metadata = self._chunk_metadata[:1] + return [("Question: What is X?\n\nAnswer: X is Y.", 1)] + + monkeypatch.setattr( + "app.utils.chunking.QuestionChunkingStrategy", + _MockQuestionChunker, + ) diff --git a/backend/app/test/test_phase8_metadata.py b/backend/app/test/test_phase8_metadata.py new file mode 100644 index 0000000..64ca03c --- /dev/null +++ b/backend/app/test/test_phase8_metadata.py @@ -0,0 +1,149 @@ +"""Phase 8 tests: Q&A metadata enrichment (Sub-Phase 8.2). 
+ +Covers: +- Metadata enrichment with Q&A-specific fields via chunk_metadata param +- Backward compatibility: token strategy unchanged +- Page number references question location +- Chunk metadata merging with base metadata +""" +import json + +import pytest + +from app.utils.metadata import extract_metadata + + +def test_qa_metadata_fields(tmp_path): + """strategy_type, question_index, question_id, question_text merged via chunk_metadata.""" + file_path = tmp_path / "test.pdf" + file_path.write_text("dummy content") + + chunks = ["chunk 1", "chunk 2"] + chunk_metadata = [ + { + "strategy_type": "question", + "section_type": "qa", + "question_index": 0, + "question_id": "A1", + "question_text": "What is X?", + "section_heading": "(A) Section", + "answer_contains_table": True, + "source_page_range": [2, 5], + "parent_topic": "Topic Name", + }, + { + "strategy_type": "question", + "section_type": "qa", + "question_index": 1, + "question_id": "A2", + "question_text": "What is Y?", + "section_heading": "(A) Section", + "answer_contains_table": False, + "source_page_range": [5, 7], + }, + ] + + metadata = extract_metadata( + file_path=str(file_path), + chunks=chunks, + strategy_type="question", + chunk_metadata=chunk_metadata, + ) + assert len(metadata) == 2 + + m0 = metadata[0] + assert m0["strategy_type"] == "question" + assert m0["section_type"] == "qa" + assert m0["question_index"] == 0 + assert m0["question_id"] == "A1" + assert m0["question_text"] == "What is X?" 
+ assert m0["section_heading"] == "(A) Section" + assert m0["answer_contains_table"] is True + assert m0["source_page_range"] == [2, 5] + assert m0["parent_topic"] == "Topic Name" + + m1 = metadata[1] + assert m1["question_index"] == 1 + assert m1["question_id"] == "A2" + assert m1["answer_contains_table"] is False + + +def test_qa_metadata_topic_section(tmp_path): + """section_heading and parent_topic are both preserved.""" + file_path = tmp_path / "test.pdf" + file_path.write_text("dummy content") + + metadata = extract_metadata( + file_path=str(file_path), + chunks=["chunk"], + strategy_type="question", + chunk_metadata=[{"section_heading": "(B) Traffic", "parent_topic": "Traffic Planning"}], + ) + assert metadata[0]["section_heading"] == "(B) Traffic" + assert metadata[0]["parent_topic"] == "Traffic Planning" + + +def test_token_metadata_unchanged(tmp_path): + """Existing metadata fields unchanged for token strategy (no chunk_metadata).""" + file_path = tmp_path / "test.txt" + file_path.write_text("test content") + + metadata = extract_metadata( + file_path=str(file_path), + chunks=["chunk 1", "chunk 2"], + original_filename="original.txt", + strategy_type="token", + ) + assert len(metadata) == 2 + for m in metadata: + assert "filename" in m + assert "upload_date" in m + assert "content_summary" in m + assert "chunk_index" in m + assert m.get("strategy_type", "token") == "token" + assert "question_id" not in m + + +def test_page_number_from_question(tmp_path): + """Page ref should point to question location (pass via page_numbers from strategy).""" + file_path = tmp_path / "test.pdf" + file_path.write_text("dummy content") + + metadata = extract_metadata( + file_path=str(file_path), + chunks=["question chunk"], + page_numbers=[3], + strategy_type="question", + chunk_metadata=[{ + "question_id": "A1", + "source_page_range": [3, 8], + }], + ) + assert metadata[0]["page_number"] == 3 + assert metadata[0]["source_page_range"] == [3, 8] + + +def 
test_chunk_metadata_length_mismatch(tmp_path): + """chunk_metadata length mismatch with chunks raises ValueError.""" + file_path = tmp_path / "test.pdf" + file_path.write_text("dummy content") + + with pytest.raises(ValueError, match="chunk_metadata length"): + extract_metadata( + file_path=str(file_path), + chunks=["a", "b", "c"], + chunk_metadata=[{}, {}], + ) + + +def test_chunk_metadata_empty_no_error(tmp_path): + """Empty chunk_metadata list with matching chunks is valid.""" + file_path = tmp_path / "test.pdf" + file_path.write_text("dummy content") + + metadata = extract_metadata( + file_path=str(file_path), + chunks=["a"], + chunk_metadata=[], + ) + assert len(metadata) == 1 diff --git a/backend/app/test/test_phase8_qa_chunking.py b/backend/app/test/test_phase8_qa_chunking.py new file mode 100644 index 0000000..34c4598 --- /dev/null +++ b/backend/app/test/test_phase8_qa_chunking.py @@ -0,0 +1,481 @@ +"""Phase 8 tests: Q&A-pair chunking strategy (Sub-Phase 8.1). + +Covers: +- LLM structure detection response parsing (parse_llm_structure_response) +- Mixed format handling (問/答 + section headings) +- Narrative-only text (no Q&A format) +- Speaking notes (發言要點) chunking by bullet +- Regex fast-pass for Chinese 問/答 format +- Regex fast-pass for English Q1/Q2 format +- Multi-page section tracking with [PAGE_BREAK] markers +- ChunkingStrategy ABC compliance +- Page number references question (問) page, not answer +- Size limit: oversized sections recursively split with heading preserved +- build_chunks_from_sections output verification +- preprocess_text: footer stripping, colon normalization, page break insertion +""" + +import json +from typing import List, Tuple +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from app.utils.qa_chunking import ( + Section, + preprocess_text, + build_structure_detection_prompt, + parse_llm_structure_response, + split_chinese_qa, + split_english_qa, + build_chunks_from_sections, +) +from app.utils.chunking import 
( + ChunkingStrategy, + QuestionChunkingStrategy, + get_chunking_strategy, +) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture +def mock_settings(): + """Minimal Settings mock with Q&A chunking defaults.""" + s = MagicMock() + s.default_chunking_strategy = "question" + s.qa_vision_enabled = False + s.qa_max_chunk_tokens = 3000 + s.qa_structure_model = "" + s.qa_include_internal_refs = True + s.qa_cache_vision_results = True + s.chunk_size = 1000 + s.chunk_overlap = 200 + s.llm_model_name = "test-model" + s.llm_api_key = "test-key" + s.llm_base_url = "https://example.com/v1" + s.llm_timeout = 30.0 + s.llm_enable_thinking = False + s.vllm_engine = False + return s + + +SAMPLE_LLM_RESPONSE = json.dumps({ + "sections": [ + { + "type": "qa", + "heading": "(A) 排水系統", + "qa_id": "A1", + "question": "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?", + "answer": "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。", + "start_page": 2, + "end_page": 3, + "has_table": False, + "parent_topic": "排水系統", + }, + { + "type": "narrative", + "heading": "(1) 住戶的安置補償", + "content": "合資格住戶可選擇(i)「須通過經濟狀況審查」安置安排。", + "start_page": 2, + "end_page": 5, + "has_table": False, + }, + { + "type": "speaking_notes", + "heading": "發言要點", + "content": "⚫ 古洞北/粉嶺北新發展區是北部都會區內的新市鎮發展項目\n⚫ 第一期發展預計於2030年完成", + "start_page": 1, + "end_page": 2, + "has_table": False, + }, + ] +}) + + +# --------------------------------------------------------------------------- +# Test: LLM structure detection parsing +# --------------------------------------------------------------------------- + +class TestLLMStructureDetection: + + def test_llm_structure_detection(self): + """parse_llm_structure_response correctly parses LLM JSON with qa/narrative/speaking_notes.""" + sections = parse_llm_structure_response(SAMPLE_LLM_RESPONSE) + assert len(sections) == 3 + + qa = sections[0] + assert qa.type == "qa" + assert qa.qa_id == "A1" 
+ assert qa.question == "古洞北和粉嶺北新發展區的設計是否能抵禦氣候變化?" + assert qa.answer == "研究顧問已為古洞北和粉嶺北新發展區研究範圍進行了評估。" + assert qa.start_page == 2 + assert qa.end_page == 3 + assert qa.heading == "(A) 排水系統" + assert qa.parent_topic == "排水系統" + + narr = sections[1] + assert narr.type == "narrative" + assert narr.heading == "(1) 住戶的安置補償" + assert "合資格住戶" in narr.content + + notes = sections[2] + assert notes.type == "speaking_notes" + assert "⚫" in notes.content + + def test_llm_handles_mixed_formats(self): + """Document with 問/答 markers + section headings correctly classified.""" + mixed_json = json.dumps({ + "sections": [ + { + "type": "qa", + "heading": "(B) 交通", + "qa_id": "B1", + "question": "新建道路何時通車?", + "answer": "預計2027年通車。", + "start_page": 3, + "end_page": 4, + "has_table": False, + }, + { + "type": "narrative", + "heading": "背景", + "content": "本文件說明交通規劃。", + "start_page": 1, + "end_page": 2, + "has_table": False, + }, + ] + }) + sections = parse_llm_structure_response(mixed_json) + assert len(sections) == 2 + assert sections[0].type == "qa" + assert sections[1].type == "narrative" + + def test_llm_handles_no_qa_format(self): + """Narrative-only text (like File L pages 1-13) produces only narrative sections.""" + narrative_json = json.dumps({ + "sections": [ + { + "type": "narrative", + "heading": "Introduction", + "content": "This document provides background on policy matters.", + "start_page": 1, + "end_page": 5, + "has_table": False, + }, + { + "type": "narrative", + "heading": "Analysis", + "content": "The analysis covers multiple dimensions.", + "start_page": 5, + "end_page": 13, + "has_table": False, + }, + ] + }) + sections = parse_llm_structure_response(narrative_json) + assert len(sections) == 2 + assert all(s.type == "narrative" for s in sections) + + def test_llm_handles_speaking_notes(self): + """發言要點 text with bullet points produces speaking_notes sections.""" + notes_json = json.dumps({ + "sections": [ + { + "type": "speaking_notes", + "heading": "發言要點", + 
"content": "⚫ 要點一:政策方向\n⚫ 要點二:實施計劃\n⚫ 要點三:預算安排", + "start_page": 1, + "end_page": 2, + "has_table": False, + }, + ] + }) + sections = parse_llm_structure_response(notes_json) + assert len(sections) == 1 + assert sections[0].type == "speaking_notes" + assert sections[0].content.count("⚫") == 3 + + def test_parse_markdown_fenced_json(self): + """parse_llm_structure_response handles ```json ... ``` wrapped responses.""" + fenced = '```json\n' + SAMPLE_LLM_RESPONSE + '\n```' + sections = parse_llm_structure_response(fenced) + assert len(sections) == 3 + + def test_parse_invalid_json_raises(self): + """parse_llm_structure_response raises ValueError on non-JSON input.""" + with pytest.raises(ValueError, match="Invalid JSON"): + parse_llm_structure_response("this is not json") + + +# --------------------------------------------------------------------------- +# Test: Regex fast-pass +# --------------------------------------------------------------------------- + +class TestRegexFastPass: + + def test_regex_fastpass_chinese(self): + """Text with 問B1/答B1 markers detected by split_chinese_qa without LLM.""" + text = ( + "(A) 排水系統\n" + "問 B1:古洞北的設計是否能抵禦氣候變化?\n" + "答 B1:研究顧問已為古洞北新發展區進行了評估。\n" + "問 B2:第二個問題是什麼?\n" + "答 B2:這是第二個問題的答案。\n" + ) + sections = split_chinese_qa(text) + assert len(sections) >= 2 + # All should be QA type + assert all(s.type == "qa" for s in sections) + # First should have question containing 古洞北 + assert "古洞北" in sections[0].question + + def test_regex_fastpass_chinese_no_match(self): + """split_chinese_qa returns empty list when no markers found.""" + text = "This is plain text without any Q&A markers." 
+ assert split_chinese_qa(text) == [] + + def test_regex_fastpass_english(self): + """Text with Q1, Q2 markers detected by split_english_qa without LLM.""" + text = ( + "Background information here.\n\n" + "Q1 What is the timeline for the project?\n" + "The project is expected to complete by 2027.\n" + "Q2 How much will it cost?\n" + "The estimated cost is HK$500 million.\n" + ) + sections = split_english_qa(text) + assert len(sections) >= 2 + assert all(s.type == "qa" for s in sections) + assert any("timeline" in (s.question or "").lower() for s in sections) + + def test_regex_fastpass_english_no_match(self): + """split_english_qa returns empty list when no markers found.""" + text = "純中文文本沒有英文問答標記。" + assert split_english_qa(text) == [] + + +# --------------------------------------------------------------------------- +# Test: Multi-page tracking +# --------------------------------------------------------------------------- + +class TestMultiPage: + + def test_multi_page_sections(self): + """Sections with [PAGE_BREAK: N] markers spanning pages track correctly.""" + pages = [ + (1, "Header line\n(A) Water drainage\nSome intro text"), + (2, "More drainage info\nFooter text X-1"), + (3, "New section begins\n(B) Traffic planning"), + ] + text = preprocess_text(pages) + # Should have page break markers + assert "[PAGE_BREAK: 1]" in text + assert "[PAGE_BREAK: 2]" in text + assert "[PAGE_BREAK: 3]" in text + + +# --------------------------------------------------------------------------- +# Test: ABC contract +# --------------------------------------------------------------------------- + +class TestABCContract: + + def test_abc_contract(self): + """QuestionChunkingStrategy satisfies ChunkingStrategy ABC.""" + mock_settings = MagicMock() + mock_settings.qa_max_chunk_tokens = 3000 + mock_settings.qa_include_internal_refs = True + strategy = QuestionChunkingStrategy(settings=mock_settings) + assert isinstance(strategy, ChunkingStrategy) + + def 
test_get_chunking_strategy_factory(self, mock_settings): + """get_chunking_strategy returns correct strategy type.""" + token_strat = get_chunking_strategy("token", mock_settings) + assert isinstance(token_strat, ChunkingStrategy) + + q_strat = get_chunking_strategy("question", mock_settings) + assert isinstance(q_strat, QuestionChunkingStrategy) + + +# --------------------------------------------------------------------------- +# Test: Page number reference +# --------------------------------------------------------------------------- + +class TestPageNumberReference: + + def test_page_number_reference_question(self): + """Page ref in metadata points to question (問) page, not answer page.""" + sections = [ + Section( + type="qa", + heading="(A) Topic", + qa_id="A1", + question="What is X?", + answer="X is Y.", + start_page=5, + end_page=8, + ), + ] + chunks = build_chunks_from_sections(sections) + assert len(chunks) == 1 + chunk_text, page_num, metadata = chunks[0] + # Page number should be start_page (question location) + assert page_num == 5 + assert metadata.get("source_page_range") == [5, 8] + + +# --------------------------------------------------------------------------- +# Test: Size limit recursive split +# --------------------------------------------------------------------------- + +class TestSizeLimit: + + def test_size_limit(self): + """Oversized QA section > 3000 tokens gets recursively split with question prepended.""" + # Create a QA pair with a very long answer + long_answer = "\n\n".join(f"Paragraph {i}: " + "x" * 200 for i in range(80)) + sections = [ + Section( + type="qa", + heading="(A) Topic", + qa_id="A1", + question="What is the detailed plan?", + answer=long_answer, + start_page=2, + end_page=5, + has_table=False, + ), + ] + # Use a small max_tokens to force splitting + chunks = build_chunks_from_sections(sections, max_tokens=500) + assert len(chunks) > 1 + # Each chunk should have the question text prepended + for chunk_text, page_num, 
metadata in chunks: + assert "What is the detailed plan?" in chunk_text + # Page number should always be the question page + assert page_num == 2 + + +# --------------------------------------------------------------------------- +# Test: build_chunks_from_sections +# --------------------------------------------------------------------------- + +class TestBuildChunksFromSections: + + def test_build_chunks_from_sections(self): + """Verify chunk texts and metadata from sections list.""" + sections = [ + Section( + type="qa", + heading="(A) 排水系統", + qa_id="A1", + question="古洞北的設計是否能抵禦氣候變化?", + answer="研究顧問已為古洞北進行了評估。", + start_page=2, + end_page=3, + has_table=True, + parent_topic="排水系統", + ), + Section( + type="narrative", + heading="(1) 住戶的安置補償", + content="合資格住戶可選擇安置安排。", + start_page=3, + end_page=5, + has_table=False, + ), + Section( + type="speaking_notes", + heading="發言要點", + content="⚫ 要點一:政策方向\n⚫ 要點二:實施計劃", + start_page=1, + end_page=1, + has_table=False, + ), + Section( + type="toc", + heading="目錄", + content="Page 1 ... 
Page 2", + start_page=1, + end_page=1, + has_table=False, + ), + ] + chunks = build_chunks_from_sections(sections) + # Should have: 1 QA + 1 narrative + 2 speaking_notes bullets + 0 toc = 4 + assert len(chunks) >= 4 + + # First chunk: QA + qa_text, qa_page, qa_meta = chunks[0] + assert "古洞北" in qa_text + assert qa_page == 2 + assert qa_meta["section_type"] == "qa" + assert qa_meta["question_id"] == "A1" + assert qa_meta["question_index"] == 0 + assert qa_meta["answer_contains_table"] is True + assert qa_meta["section_heading"] == "(A) 排水系統" + + # Find the narrative chunk + narr_chunks = [(t, p, m) for t, p, m in chunks if m.get("section_type") == "narrative"] + assert len(narr_chunks) == 1 + narr_text, narr_page, narr_meta = narr_chunks[0] + assert "住戶的安置補償" in narr_text + assert "合資格住戶" in narr_text + + # Find speaking_notes chunks + notes_chunks = [(t, p, m) for t, p, m in chunks if m.get("section_type") == "speaking_notes"] + assert len(notes_chunks) == 2 + for t, p, m in notes_chunks: + assert "要點" in t + + # No TOC chunks + toc_chunks = [(t, p, m) for t, p, m in chunks if m.get("section_type") == "toc"] + assert len(toc_chunks) == 0 + + +# --------------------------------------------------------------------------- +# Test: preprocess_text +# --------------------------------------------------------------------------- + +class TestPreprocessText: + + def test_preprocess_text(self): + """Footer markers stripped, colons normalized, page breaks inserted.""" + pages = [ + (1, "Header\n(A) Section Title\nX-1\n2024-01-15"), + (2, "Content with:fullwidth colon\nMore text:here"), + ] + result = preprocess_text(pages) + + # Should have page break markers + assert "[PAGE_BREAK: 1]" in result + assert "[PAGE_BREAK: 2]" in result + + # Fullwidth colons normalized to ASCII + assert ":" not in result + assert ":" in result + + # Page footer patterns should be stripped (X-1, dates like 2024-01-15) + assert "X-1" not in result + assert "2024-01-15" not in result + + +# 
--------------------------------------------------------------------------- +# Test: build_structure_detection_prompt +# --------------------------------------------------------------------------- + +class TestBuildPrompt: + + def test_build_structure_detection_prompt(self): + """Prompt contains key instructions for LLM classification.""" + text = "Sample document text [PAGE_BREAK: 1]" + prompt = build_structure_detection_prompt(text) + assert "Hong Kong Legislative Council" in prompt or "Legislative Council" in prompt + assert "qa" in prompt.lower() or "問" in prompt + assert "narrative" in prompt.lower() + assert "speaking_notes" in prompt.lower() or "speaking notes" in prompt.lower() + assert text in prompt diff --git a/backend/app/utils/chunking.py b/backend/app/utils/chunking.py index 8118bda..3bc7a45 100644 --- a/backend/app/utils/chunking.py +++ b/backend/app/utils/chunking.py @@ -6,8 +6,15 @@ token-based windows. """ from __future__ import annotations +import logging from abc import ABC, abstractmethod -from typing import List, Tuple +from typing import TYPE_CHECKING, List, Optional, Tuple + +if TYPE_CHECKING: + from app.core.config import Settings + from app.services.llm_client import LLMClient + +logger = logging.getLogger(__name__) class ChunkingStrategy(ABC): @@ -117,3 +124,111 @@ class TokenChunkingStrategy(ChunkingStrategy): results.append(("\n".join(parts), page_num)) return results + + +class QuestionChunkingStrategy(ChunkingStrategy): + """Chunk text by detecting Q&A structure using LLM and/or regex patterns. + + Designed for LegCo documents with explicit 問/答 or Q1/Q2 markers. + Falls back to section-based chunking for narrative-only documents. 
+ """ + + def __init__( + self, + settings: "Settings", + llm_client: Optional["LLMClient"] = None, + ): + self._settings = settings + self._llm_client = llm_client + self._max_tokens = getattr(settings, "qa_max_chunk_tokens", 3000) + self._chunk_metadata: List[dict] = [] + + def chunk(self, text: str) -> List[str]: + """Split text into chunks using Q&A detection (for DOCX/TXT).""" + if not text or not text.strip(): + return [] + + from app.utils.qa_chunking import ( + split_chinese_qa, + split_english_qa, + build_chunks_from_sections, + Section, + ) + + sections = split_chinese_qa(text) + if not sections: + sections = split_english_qa(text) + + if not sections: + sections = [Section(type="narrative", content=text, start_page=1, end_page=1)] + + results = build_chunks_from_sections(sections, max_tokens=self._max_tokens) + self._chunk_metadata = [meta for _, _, meta in results] + return [chunk_text for chunk_text, _, _ in results] + + def chunk_pages( + self, pages: List[Tuple[int, str]], overlap_tokens: int = 0 + ) -> List[Tuple[str, int]]: + """Split page-segmented text using Q&A detection (for PDF). + + Returns list of (chunk_text, page_number) where page_number + references the question location for Q&A chunks. 
+ """ + if not pages: + return [] + + from app.utils.qa_chunking import ( + preprocess_text, + split_chinese_qa, + split_english_qa, + build_chunks_from_sections, + parse_llm_structure_response, + build_structure_detection_prompt, + Section, + ) + + full_text = preprocess_text(pages) + + sections = split_chinese_qa(full_text) + if not sections: + sections = split_english_qa(full_text) + + if not sections and self._llm_client is not None: + import asyncio + prompt = build_structure_detection_prompt(full_text) + try: + loop = asyncio.get_event_loop() + if loop.is_running(): + sections = [] + else: + response = loop.run_until_complete( + self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection") + ) + sections = parse_llm_structure_response(response) + except Exception: + logger.warning("LLM structure detection failed, using fallback", exc_info=True) + + if not sections: + sections = [Section(type="narrative", content=full_text, start_page=1, end_page=len(pages))] + + results = build_chunks_from_sections(sections, max_tokens=self._max_tokens) + self._chunk_metadata = [meta for _, _, meta in results] + return [(chunk_text, page_num) for chunk_text, page_num, _ in results] + + +def get_chunking_strategy(name: str, settings: "Settings") -> ChunkingStrategy: + """Factory: return the named chunking strategy. + + Args: + name: "token" or "question" + settings: Application settings instance. + + Returns: + ChunkingStrategy instance. 
+ """ + if name == "question": + return QuestionChunkingStrategy(settings=settings) + return TokenChunkingStrategy( + chunk_size=settings.chunk_size, + overlap=settings.chunk_overlap, + ) diff --git a/backend/app/utils/metadata.py b/backend/app/utils/metadata.py index e6cb538..5fcf478 100644 --- a/backend/app/utils/metadata.py +++ b/backend/app/utils/metadata.py @@ -12,6 +12,8 @@ def extract_metadata( page_numbers: List[int | None] | None = None, chunk_file_paths: List[str | None] | None = None, document_id: str | None = None, + strategy_type: str = "token", + chunk_metadata: List[Dict[str, Any]] | None = None, ) -> List[Dict[str, Any]]: """Extract metadata for a list of text chunks. @@ -23,6 +25,10 @@ def extract_metadata( - chunk_file_path: path to the per-chunk source file - document_id: unique identifier linking all chunks to the same document + Package 8 Q&A fields (present when chunk_metadata provided): + - strategy_type, section_type, question_index, question_id, question_text, + section_heading, answer_contains_table, source_page_range, parent_topic + Args: file_path: Path to the file associated with the chunks. chunks: List of string chunks to generate metadata for. @@ -31,6 +37,12 @@ def extract_metadata( page_numbers: Optional per-chunk page numbers. Length must match chunks. chunk_file_paths: Optional per-chunk source file paths. Length must match chunks. document_id: Optional unique document identifier applied to all chunks. + strategy_type: Chunking strategy used ("token" or "question"). Stored in + each chunk's metadata. + chunk_metadata: Optional per-chunk metadata dicts from Q&A strategy. + Each dict is merged into the corresponding base metadata entry. + Length must match chunks. Fields like question_id, question_index, + section_type, etc. are forwarded to ChromaDB metadata. Returns: A list of metadata dictionaries, one per chunk. Empty list if chunks is empty. 
@@ -55,6 +67,11 @@ def extract_metadata( f"chunk_file_paths length ({len(chunk_file_paths)}) does not match chunks length ({len(chunks)})" ) + if chunk_metadata is not None and len(chunk_metadata) > 0 and len(chunk_metadata) != len(chunks): + raise ValueError( + f"chunk_metadata length ({len(chunk_metadata)}) does not match chunks length ({len(chunks)})" + ) + filename = original_filename if original_filename else os.path.basename(file_path) upload_date = datetime.now().isoformat() @@ -68,6 +85,7 @@ def extract_metadata( "content_summary": content_summary, "chunk_index": idx, "document_id": document_id, + "strategy_type": strategy_type, } page_num = page_numbers[idx] if page_numbers else None if page_num is not None: @@ -75,6 +93,8 @@ def extract_metadata( cfp = chunk_file_paths[idx] if chunk_file_paths else None if cfp is not None: entry["chunk_file_path"] = cfp + if chunk_metadata: + entry.update(chunk_metadata[idx]) metadata.append(entry) return metadata diff --git a/backend/app/utils/qa_chunking.py b/backend/app/utils/qa_chunking.py new file mode 100644 index 0000000..c0f4f72 --- /dev/null +++ b/backend/app/utils/qa_chunking.py @@ -0,0 +1,361 @@ +"""Q&A-pair chunking utilities for Package 8. + +Provides section detection (LLM + regex), text preprocessing, +and chunk building for LegCo documents with Q&A structure. 
+""" +from __future__ import annotations + +import json +import logging +import re +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple + +logger = logging.getLogger(__name__) + + +@dataclass +class Section: + """A detected section within a LegCo document.""" + type: str # "qa" | "narrative" | "speaking_notes" | "table" | "toc" | "heading_only" + heading: str = "" + qa_id: Optional[str] = None + question: Optional[str] = None + answer: Optional[str] = None + content: str = "" + start_page: int = 1 + end_page: int = 1 + has_table: bool = False + parent_topic: str = "" + + +_FOOTER_RE = re.compile(r"^[A-Z]-\d+\s*$", re.MULTILINE) +_FOOTER_DATE_RE = re.compile(r"^[A-Z]-\d+\s*\n\d{4}-\d{2}-\d{2}$", re.MULTILINE) +_HEADER_LETTER_RE = re.compile(r"^(\([A-Z]\))\s*$", re.MULTILINE) +_FULLWIDTH_COLON_RE = re.compile("[︰:]") + + +def preprocess_text(pages: List[Tuple[int, str]]) -> str: + """Concatenate pages, strip footers/headers, normalize colons, insert [PAGE_BREAK: N] markers.""" + parts: List[str] = [] + for idx, (page_num, page_text) in enumerate(pages): + text = _FOOTER_DATE_RE.sub("", page_text) + text = _FOOTER_RE.sub("", text) + if idx > 0: + text = _HEADER_LETTER_RE.sub("", text) + text = _FULLWIDTH_COLON_RE.sub(":", text) + parts.append(f"[PAGE_BREAK: {page_num}]\n{text}") + return "\n".join(parts) + + +_STRUCTURE_PROMPT_TEMPLATE = """You are analyzing a Hong Kong Legislative Council document. +The text has page markers like [PAGE_BREAK: N] showing where pages begin. + +For each distinct section in this document, identify: +1. The section type: + - "qa": a question-and-answer pair (問/答 or Q1/Q2 format) + - "narrative": policy text, explanatory paragraphs, section content with bullets + - "speaking_notes": briefing points (發言要點) with bullet markers + - "table": standalone data tables (not embedded in answers) + - "toc": table of contents + - "heading_only": a section heading with no following content + +2. 
For "qa" sections: + - The question text (exact) + - The answer text (exact, including tables, bullet lists, and [內部參考] content) + - The question ID if present (e.g. "A1", "Q3") + - The start page and end page + +3. For all sections: + - The section heading (e.g. "(A) 排水系統", "(1) 住戶的安置補償") + - The start page and end page + - Whether the section contains tables + +Return JSON: +{{ + "sections": [ + {{ + "type": "qa", + "heading": "(A) 排水系統", + "qa_id": "A1", + "question": "...", + "answer": "...", + "start_page": 2, + "end_page": 3, + "has_table": true, + "parent_topic": "排水系統" + }}, + {{ + "type": "narrative", + "heading": "(1) 住戶的安置補償", + "content": "...", + "start_page": 2, + "end_page": 5, + "has_table": false + }} + ] +}} + +DOCUMENT TEXT: +{document_text}""" + + +def build_structure_detection_prompt(text: str) -> str: + """Construct the LLM prompt for section classification.""" + return _STRUCTURE_PROMPT_TEMPLATE.format(document_text=text) + + +_MARKDOWN_FENCE_RE = re.compile(r"```(?:json)?\s*\n?(.*?)\n?```", re.DOTALL) + + +def parse_llm_structure_response(response_text: str) -> List[Section]: + """Parse the JSON returned by the LLM. Handle markdown code fences. + + Raises ValueError if response is not valid JSON. 
+ """ + cleaned = response_text.strip() + fence_match = _MARKDOWN_FENCE_RE.search(cleaned) + if fence_match: + cleaned = fence_match.group(1).strip() + + try: + data = json.loads(cleaned) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid JSON from LLM structure detection: {exc}") from exc + + sections_raw = data.get("sections", []) + sections: List[Section] = [] + for raw in sections_raw: + sections.append(Section( + type=raw.get("type", "narrative"), + heading=raw.get("heading", ""), + qa_id=raw.get("qa_id"), + question=raw.get("question"), + answer=raw.get("answer"), + content=raw.get("content", ""), + start_page=raw.get("start_page", 1), + end_page=raw.get("end_page", 1), + has_table=raw.get("has_table", False), + parent_topic=raw.get("parent_topic", ""), + )) + return sections + + +_CN_QA_RE = re.compile( + r"問\s*([A-Z]\d+)\s*[︰::]\s*(.*?)\s*" + r"(?:\n\s*答\s*\1\s*[︰::]\s*(.*?)\s*)" + r"(?=\n\s*(?:問\s*[A-Z]\d+\s*[︰::]|$))", + re.DOTALL, +) + + +def split_chinese_qa(text: str) -> List[Section]: + """Regex fast-pass for 問/答 format. Returns empty list if no matches found.""" + sections: List[Section] = [] + for m in _CN_QA_RE.finditer(text): + qa_id = m.group(1) + question = m.group(2).strip() + answer = (m.group(3) or "").strip() + sections.append(Section( + type="qa", + qa_id=qa_id, + question=question, + answer=answer, + )) + return sections + + +_EN_QA_RE = re.compile( + r"^(Q\d+)\s+(.*?)\s*$\n((?:(?!^Q\d+).+(?:\n|$))*)", + re.MULTILINE, +) + + +def split_english_qa(text: str) -> List[Section]: + """Regex fast-pass for Q-number format. 
Returns empty list if no matches found.""" + sections: List[Section] = [] + for m in _EN_QA_RE.finditer(text): + qa_id = m.group(1) + question = m.group(2).strip() + answer = m.group(3).strip() + sections.append(Section( + type="qa", + qa_id=qa_id, + question=question, + answer=answer, + )) + return sections + + +def _estimate_tokens(text: str) -> int: + """Rough token estimate: ~1.3 tokens per CJK char, ~1 token per 4 chars for Latin.""" + cjk_count = 0 + latin_len = 0 + for ch in text: + if "\u4e00" <= ch <= "\u9fff": + cjk_count += 1 + else: + latin_len += 1 + return int(cjk_count * 1.3 + latin_len / 4) + + +def _split_oversized_qa( + question: str, answer: str, page: int, heading: str, + qa_id: Optional[str], question_index: int, has_table: bool, + parent_topic: str, start_page: int, end_page: int, + max_tokens: int, +) -> List[Tuple[str, int, dict]]: + """Recursively split an oversized Q&A answer with question prepended to each sub-chunk.""" + # Try paragraph boundaries first + parts = answer.split("\n\n") + if len(parts) <= 1: + parts = answer.split("\n") + + # Group parts into sub-chunks that fit within max_tokens + sub_chunks: List[str] = [] + current = "" + for part in parts: + candidate = (current + "\n\n" + part) if current else part + if _estimate_tokens(f"Question: {question}\n\nAnswer (part 1/N): {candidate}") > max_tokens and current: + sub_chunks.append(current) + current = part + else: + current = candidate + if current: + sub_chunks.append(current) + + total = len(sub_chunks) + results: List[Tuple[str, int, dict]] = [] + for i, sub in enumerate(sub_chunks): + chunk_text = f"Question: {question}\n\nAnswer (part {i + 1}/{total}): {sub}" + meta = { + "strategy_type": "question", + "section_type": "qa", + "question_index": question_index, + "question_id": qa_id, + "question_text": question, + "section_heading": heading, + "answer_contains_table": has_table, + "source_page_range": [start_page, end_page], + "parent_topic": parent_topic, + } + 
results.append((chunk_text, page, meta)) + return results + + +def build_chunks_from_sections( + sections: List[Section], max_tokens: int = 3000, +) -> List[Tuple[str, int, dict]]: + """Build chunk texts + page refs + metadata from sections. + + Returns List[(chunk_text, page_number, metadata_dict)]. + """ + chunks: List[Tuple[str, int, dict]] = [] + qa_index = 0 + + for section in sections: + if section.type in ("toc", "heading_only"): + continue + + if section.type == "qa": + question_text = section.question or "" + answer_text = section.answer or "" + chunk_text = f"Question: {question_text}\n\nAnswer: {answer_text}" + + if section.heading: + chunk_text = f"[{section.heading}]\n{chunk_text}" + + page = section.start_page + meta: Dict = { + "strategy_type": "question", + "section_type": "qa", + "question_index": qa_index, + "question_id": section.qa_id, + "question_text": question_text, + "section_heading": section.heading, + "answer_contains_table": section.has_table, + "source_page_range": [section.start_page, section.end_page], + "parent_topic": section.parent_topic, + } + + if _estimate_tokens(chunk_text) > max_tokens: + chunks.extend(_split_oversized_qa( + question=question_text, + answer=answer_text, + page=page, + heading=section.heading, + qa_id=section.qa_id, + question_index=qa_index, + has_table=section.has_table, + parent_topic=section.parent_topic, + start_page=section.start_page, + end_page=section.end_page, + max_tokens=max_tokens, + )) + else: + chunks.append((chunk_text, page, meta)) + + qa_index += 1 + + elif section.type == "narrative": + content = section.content + if not content.strip(): + continue + prefix = f"[{section.heading}]\n" if section.heading else "" + chunk_text = f"{prefix}{content}" + meta = { + "strategy_type": "question", + "section_type": "narrative", + "section_heading": section.heading, + "source_page_range": [section.start_page, section.end_page], + } + if _estimate_tokens(chunk_text) <= max_tokens: + 
chunks.append((chunk_text, section.start_page, meta)) + else: + paragraphs = content.split("\n\n") + current = "" + for para in paragraphs: + candidate = (current + "\n\n" + para) if current else para + full = f"{prefix}{candidate}" + if _estimate_tokens(full) > max_tokens and current: + chunks.append((f"{prefix}{current}", section.start_page, dict(meta))) + current = para + else: + current = candidate + if current: + chunks.append((f"{prefix}{current}", section.start_page, dict(meta))) + + elif section.type == "speaking_notes": + content = section.content + if not content.strip(): + continue + bullets = re.split(r"(?=⚫)", content) + bullets = [b.strip() for b in bullets if b.strip()] + if not bullets: + bullets = [content] + prefix = f"[{section.heading}]\n" if section.heading else "" + for bullet in bullets: + chunk_text = f"{prefix}{bullet}" + meta = { + "strategy_type": "question", + "section_type": "speaking_notes", + "section_heading": section.heading, + "source_page_range": [section.start_page, section.end_page], + } + chunks.append((chunk_text, section.start_page, meta)) + + elif section.type == "table": + content = section.content + if not content.strip(): + continue + chunk_text = f"[{section.heading}]\n{content}" if section.heading else content + meta = { + "strategy_type": "question", + "section_type": "table", + "section_heading": section.heading, + "answer_contains_table": True, + "source_page_range": [section.start_page, section.end_page], + } + chunks.append((chunk_text, section.start_page, meta)) + + return chunks diff --git a/backend/app/utils/table_extraction.py b/backend/app/utils/table_extraction.py new file mode 100644 index 0000000..64d6714 --- /dev/null +++ b/backend/app/utils/table_extraction.py @@ -0,0 +1,147 @@ +"""Table extraction utilities for Package 8. + +Provides vision-based and text-based table detection and markdown conversion +for LegCo documents. Uses the existing LLM model (vision-capable) for +table-to-markdown conversion. 
+""" +from __future__ import annotations + +import hashlib +import json +import logging +import os +from pathlib import Path +from typing import List, Optional + +logger = logging.getLogger(__name__) + +_CACHE_DIR = Path(__file__).resolve().parent.parent.parent / ".cache" / "vision_tables" + + +async def extract_tables_vision(page_images: List[str], llm_client) -> List[str]: + """Send page images to vision LLM, get back markdown tables. + + Each page_image is a base64-encoded PNG string. + Uses the existing LLM model which supports vision input. + """ + results: List[str] = [] + prompt = ( + "Convert this page to Markdown. For any tables:\n" + "- Use proper markdown table syntax with |---|---| alignment\n" + "- Preserve all column headers and row labels\n" + "- Do not modify or translate the content\n" + "- If a table spans multiple pages, note it" + ) + for idx, img_b64 in enumerate(page_images): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{img_b64}"}, + }, + ], + } + ] + try: + response = await llm_client._client.chat.completions.create( + model=llm_client.model, + messages=messages, + temperature=0.1, + ) + content = response.choices[0].message.content or "" + if content.strip(): + results.append(content.strip()) + except Exception: + logger.warning("Vision table extraction failed for page image %d", idx, exc_info=True) + return results + + +_TABLE_HEURISTIC_RE = [ + r"(?:\|[\s\-:]+\|)", + r"(?:\+[-=]+\+)", + r"(?:(?:\S+\s{2,}){3,}\n)", +] + +_TABLE_REGION_PROMPT = ( + "Convert this raw table text extracted from a PDF into a markdown table.\n" + "Preserve all data exactly. 
Detect column boundaries and alignment.\n\n" + "{table_text}" +) + + +async def extract_tables_text(text: str, llm_client) -> List[str]: + """Detect table-like text regions, send to LLM for markdown conversion.""" + import re + + regions: List[str] = [] + lines = text.split("\n") + current_region: List[str] = [] + in_table = False + + for line in lines: + is_table_line = any(re.search(pat, line) for pat in _TABLE_HEURISTIC_RE) + if is_table_line: + in_table = True + current_region.append(line) + elif in_table and line.strip(): + current_region.append(line) + else: + if len(current_region) >= 3: + regions.append("\n".join(current_region)) + current_region = [] + in_table = False + + if len(current_region) >= 3: + regions.append("\n".join(current_region)) + + if not regions: + return [] + + results: List[str] = [] + for region in regions: + prompt = _TABLE_REGION_PROMPT.format(table_text=region) + try: + response = await llm_client.complete(prompt, temperature=0.1, step_name="TableExtraction") + if response.strip(): + results.append(response.strip()) + except Exception: + logger.warning("Text-based table extraction failed", exc_info=True) + return results + + +def inject_tables_into_answer(answer: str, tables_md: List[str]) -> str: + """Replace raw table text regions in answer with markdown tables.""" + if not tables_md: + return answer + result = answer + for table_md in tables_md: + lines = table_md.split("\n") + if not lines: + continue + header_line = lines[0] + if header_line.strip() in result: + result = result.replace(header_line.strip(), table_md) + return result + + +def cache_vision_result(page_hash: str) -> Optional[str]: + """Simple disk cache: hash→markdown stored in .cache dir. 
Returns None on miss.""" + cache_file = _CACHE_DIR / f"{page_hash}.md" + if cache_file.exists(): + return cache_file.read_text(encoding="utf-8") + return None + + +def save_vision_result(page_hash: str, markdown: str) -> None: + """Save a vision result to the disk cache.""" + _CACHE_DIR.mkdir(parents=True, exist_ok=True) + cache_file = _CACHE_DIR / f"{page_hash}.md" + cache_file.write_text(markdown, encoding="utf-8") + + +def compute_page_hash(page_image_b64: str) -> str: + """Compute a hash for a page image for cache key purposes.""" + return hashlib.sha256(page_image_b64.encode("utf-8")).hexdigest()[:16] diff --git a/frontend/src/components/ChunkList.tsx b/frontend/src/components/ChunkList.tsx index 4d9faee..2ab0b39 100644 --- a/frontend/src/components/ChunkList.tsx +++ b/frontend/src/components/ChunkList.tsx @@ -56,9 +56,32 @@ export const ChunkList: React.FC = ({ Chunk {chunk.chunk_index} - - Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'} - + {chunk.strategy_type === 'question' && chunk.question_id ? ( + <> + + Q: {chunk.question_id}{chunk.question_text ? ` — ${chunk.question_text.length > 80 ? `${chunk.question_text.slice(0, 80)}...` : chunk.question_text}` : ''} + + {chunk.topic_section && ( + + Topic: {chunk.topic_section} + + )} + {chunk.source_page_range && chunk.source_page_range.length === 2 && ( + + Pages {chunk.source_page_range[0]}-{chunk.source_page_range[1]} + + )} + {chunk.has_table && ( + + Contains table + + )} + + ) : ( + + Page: {chunk.page_number !== null ? chunk.page_number : 'N/A'} + + )}
{chunk.content_summary.length > 100 @@ -67,7 +90,12 @@ export const ChunkList: React.FC = ({
{chunk.chunk_file_path && ( 0 + ? chunk.source_page_range[0] + : chunk.page_number ?? undefined + )} target="_blank" rel="noopener noreferrer" className="inline-flex items-center mt-1 text-xs text-blue-600 hover:text-blue-800 hover:underline" diff --git a/frontend/src/components/DocumentList.tsx b/frontend/src/components/DocumentList.tsx index f257c86..ec98c49 100644 --- a/frontend/src/components/DocumentList.tsx +++ b/frontend/src/components/DocumentList.tsx @@ -29,7 +29,18 @@ export const DocumentList: React.FC = ({
-
{doc.filename}
+
+ {doc.filename} + {doc.chunking_strategy === 'question' ? ( + + chunked by question + + ) : ( + + chunked by token + + )} +
{doc.chunk_count} chunks • Uploaded {doc.upload_date}
diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index 39ad983..65f09dd 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -1,5 +1,5 @@ import axios from 'axios' -import type { QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types' +import type { ChunkingStrategy, QueryRequest, QueryResponse, QueryStreamEvent, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, FullTranscriptResponse, VideoUploadResponse } from '../types' const BASE_URL: string = import.meta.env.VITE_API_BASE_URL ?? 
'http://localhost:8000/api/v1' @@ -48,10 +48,10 @@ export const queryDocumentStream = async ( } } -export const ingestDocument = async (file: File): Promise => { +export const ingestDocument = async (file: File, strategy: ChunkingStrategy = 'token'): Promise => { const form = new FormData() form.append('file', file) - const resp = await apiClient.post('/ingest', form, { + const resp = await apiClient.post(`/ingest?strategy=${strategy}`, form, { headers: { 'Content-Type': 'multipart/form-data' }, }) return resp.data diff --git a/frontend/src/lib/queries.tsx b/frontend/src/lib/queries.tsx index 27ba71b..bf02227 100644 --- a/frontend/src/lib/queries.tsx +++ b/frontend/src/lib/queries.tsx @@ -1,7 +1,7 @@ import React from 'react' import { QueryClient, QueryClientProvider, useMutation, useQuery, useQueryClient } from '@tanstack/react-query' import { queryDocument, queryDocumentStream, ingestDocument, listDocuments, listChunks, deleteDocument, deleteChunk, listPromptProfiles, getPromptProfile, activatePromptProfile, updatePrompt, updateAllPrompts, resetPrompts, exportProfile, importProfile, listQueryHistory, getQueryHistoryDetail, deleteQueryHistory, clearQueryHistory, getHistoryStats, uploadVideo } from './api' -import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types' +import type { QueryRequest, QueryResponse, QueryStreamEvent, SourceMetadata, SubQuestionSources, ChunkingStrategy, IngestResponse, DocumentListResponse, ChunkInfo, DeleteResponse, PromptProfileListResponse, PromptSetResponse, PromptUpdateRequest, PromptBatchUpdateRequest, PromptActivateResponse, 
PromptStatusResponse, ProfileExportData, ProfileImportResponse, QueryHistoryList, QueryHistoryDetail, HistoryStats, HistoryDeleteResponse, VideoUploadResponse } from '../types' import { useState, useCallback, useRef } from 'react' export const queryClient = new QueryClient() @@ -185,8 +185,8 @@ export const useQueryDocumentStream = () => { } export const useIngestDocument = () => { - return useMutation({ - mutationFn: ingestDocument, + return useMutation({ + mutationFn: ({ file, strategy }) => ingestDocument(file, strategy), }) } diff --git a/frontend/src/pages/RAGDatabasePage.tsx b/frontend/src/pages/RAGDatabasePage.tsx index 5cba000..f9e5dff 100644 --- a/frontend/src/pages/RAGDatabasePage.tsx +++ b/frontend/src/pages/RAGDatabasePage.tsx @@ -1,10 +1,11 @@ import React, { useState, useCallback, useMemo } from 'react' -import { Database, AlertCircle, CheckCircle, XCircle, Loader2 } from 'lucide-react' +import { Database, AlertCircle, CheckCircle, XCircle, Loader2, Type, MessageSquare } from 'lucide-react' import { useQueryClient } from '@tanstack/react-query' import { useDocuments, useDocumentChunks, useDeleteDocument, useDeleteChunk, useIngestDocument } from '../lib/queries' import { DocumentList } from '../components/DocumentList' import { ChunkList } from '../components/ChunkList' import { DocumentUpload } from '../components/DocumentUpload' +import type { ChunkingStrategy } from '../types' interface FileUploadEntry { name: string @@ -22,6 +23,7 @@ export const RAGDatabasePage: React.FC = () => { const initialDocId = useMemo(() => getDocumentIdFromUrl(), []) const [expandedId, setExpandedId] = useState(initialDocId) const [uploadEntries, setUploadEntries] = useState([]) + const [chunkingStrategy, setChunkingStrategy] = useState('token') const { data: documentsData, isLoading: isLoadingDocuments, error: documentsError } = useDocuments() const { data: chunks, isLoading: isLoadingChunks } = useDocumentChunks(expandedId) @@ -60,7 +62,7 @@ export const 
RAGDatabasePage: React.FC = () => { const results = await Promise.allSettled( files.map(async (file) => { try { - await ingestDocumentMutation.mutateAsync(file) + await ingestDocumentMutation.mutateAsync({ file, strategy: chunkingStrategy }) setUploadEntries((prev) => prev.map((e) => e.name === file.name ? { ...e, status: 'success' as const } : e @@ -80,7 +82,7 @@ export const RAGDatabasePage: React.FC = () => { queryClient.invalidateQueries({ queryKey: ['documents'] }) setTimeout(() => setUploadEntries([]), 5000) - }, [ingestDocumentMutation, queryClient]) + }, [ingestDocumentMutation, queryClient, chunkingStrategy]) const uploadingCount = uploadEntries.filter((e) => e.status === 'uploading').length const successCount = uploadEntries.filter((e) => e.status === 'success').length @@ -127,6 +129,42 @@ export const RAGDatabasePage: React.FC = () => { />
+
+ Chunking strategy: +
+ + +
+
+ {hasEntries && (