diff --git a/backend/app/routers/documents.py b/backend/app/routers/documents.py index b79f085..c1996c4 100644 --- a/backend/app/routers/documents.py +++ b/backend/app/routers/documents.py @@ -32,6 +32,7 @@ async def list_documents(): filename=d["filename"], chunk_count=d["chunk_count"], upload_date=d["upload_date"], + chunking_strategy=d.get("chunking_strategy", "token"), ) for d in doc_list ] @@ -59,6 +60,14 @@ async def list_chunks(document_id: str): content_summary=c["content_summary"], page_number=c.get("page_number"), chunk_file_path=c.get("chunk_file_path"), + strategy_type=c.get("strategy_type"), + question_index=c.get("question_index"), + question_id=c.get("question_id"), + question_text=c.get("question_text"), + section_heading=c.get("section_heading"), + answer_contains_table=c.get("answer_contains_table"), + source_page_range=c.get("source_page_range"), + parent_topic=c.get("parent_topic"), ) for c in chunks ] diff --git a/backend/app/routers/ingest.py b/backend/app/routers/ingest.py index 163547d..cc8f396 100644 --- a/backend/app/routers/ingest.py +++ b/backend/app/routers/ingest.py @@ -90,7 +90,7 @@ async def ingest_document( detail="Document appears to be empty or could not be parsed", ) - chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap) + chunked = await chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap) chunk_texts = [text for text, _ in chunked] page_numbers = [pn for _, pn in chunked] diff --git a/backend/app/services/rag.py b/backend/app/services/rag.py index 2d0f70a..d1abad1 100644 --- a/backend/app/services/rag.py +++ b/backend/app/services/rag.py @@ -280,7 +280,7 @@ class RAGService: if not all_data["metadatas"]: return [], 0, 0 - docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": ""}) + docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": "", "chunking_strategy": "token"}) for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"]): parts = chunk_id.rsplit("_", 1) @@ -289,6 +289,8 @@ class RAGService: docs[doc_id]["filename"] = meta.get("filename", "unknown") docs[doc_id]["chunk_count"] += 1 docs[doc_id]["upload_date"] = meta.get("upload_date", "") + if meta.get("strategy_type") == "question": + docs[doc_id]["chunking_strategy"] = "question" total_chunks = sum(d["chunk_count"] for d in docs.values()) doc_list = [ @@ -297,6 +299,7 @@ class RAGService: "filename": info["filename"], "chunk_count": info["chunk_count"], "upload_date": info["upload_date"], + "chunking_strategy": info["chunking_strategy"], } for doc_id, info in docs.items() ] @@ -315,6 +318,14 @@ class RAGService: "content_summary": meta.get("content_summary", ""), "page_number": meta.get("page_number"), "chunk_file_path": meta.get("chunk_file_path"), + "strategy_type": meta.get("strategy_type"), + "question_index": meta.get("question_index"), + "question_id": meta.get("question_id"), + "question_text": meta.get("question_text"), + "section_heading": meta.get("section_heading"), + "answer_contains_table": meta.get("answer_contains_table"), + "source_page_range": meta.get("source_page_range"), + "parent_topic": meta.get("parent_topic"), }) chunks.sort(key=lambda x: x["chunk_index"]) diff --git a/backend/app/test/test_phase1_page_aware_chunking.py b/backend/app/test/test_phase1_page_aware_chunking.py index ff53886..b6e06b1 100644 --- a/backend/app/test/test_phase1_page_aware_chunking.py +++ b/backend/app/test/test_phase1_page_aware_chunking.py @@ -4,6 +4,7 @@ Tests for TokenChunkingStrategy.chunk_pages() which creates one chunk per page with overlap context from adjacent pages. """ +import asyncio import importlib.util from pathlib import Path import pytest @@ -45,7 +46,7 @@ def test_chunk_pages_basic(): (2, _long_text("beta")), (3, _long_text("gamma")), ] - result = strat.chunk_pages(pages) + result = asyncio.run(strat.chunk_pages(pages)) assert len(result) == 3 # Each result is (chunk_text, page_number) @@ -61,7 +62,7 @@ def test_chunk_pages_single_page(): strat = _make_strategy() text = _long_text("solo") pages = [(1, text)] - result = strat.chunk_pages(pages) + result = asyncio.run(strat.chunk_pages(pages)) assert len(result) == 1 chunk_text, page_num = result[0] @@ -79,7 +80,7 @@ def test_chunk_pages_first_page(): (2, _long_text("second")), (3, _long_text("third")), ] - result = strat.chunk_pages(pages) + result = asyncio.run(strat.chunk_pages(pages)) chunk_text, page_num = result[0] assert page_num == 1 @@ -97,7 +98,7 @@ def test_chunk_pages_last_page(): (2, _long_text("second")), (3, _long_text("third")), ] - result = strat.chunk_pages(pages) + result = asyncio.run(strat.chunk_pages(pages)) chunk_text, page_num = result[-1] assert page_num == 3 @@ -110,7 +111,7 @@ def test_chunk_pages_last_page(): def test_chunk_pages_empty_input(): """Empty list returns empty list.""" strat = _make_strategy() - result = strat.chunk_pages([]) + result = asyncio.run(strat.chunk_pages([])) assert result == [] @@ -126,7 +127,7 @@ def test_chunk_pages_overlap_content(): (2, _long_text("page_two")), (3, _long_text("page_three")), ] - result = strat.chunk_pages(pages) + result = asyncio.run(strat.chunk_pages(pages)) # Page 2 chunk should contain overlap from both neighbors middle_chunk, middle_page = result[1] @@ -156,7 +157,7 @@ def test_chunk_pages_returns_page_numbers(): (10, _long_text("ten")), (99, _long_text("ninety_nine")), ] - result = strat.chunk_pages(pages) + result = asyncio.run(strat.chunk_pages(pages)) assert len(result) == 3 output_pages = [pn for _, pn in result] @@ -171,7 +172,7 @@ def test_chunk_pages_custom_overlap(): (1, _long_text("aaa")), (2, _long_text("bbb")), ] - result = strat.chunk_pages(pages, overlap_tokens=5) + result = asyncio.run(strat.chunk_pages(pages, overlap_tokens=5)) assert len(result) == 2 # Both pages present @@ -183,7 +184,7 @@ def test_chunk_pages_custom_overlap(): assert "aaa" in result[1][0] # Verify with zero overlap - result_zero = strat.chunk_pages(pages, overlap_tokens=0) + result_zero = asyncio.run(strat.chunk_pages(pages, overlap_tokens=0)) # Page 1 chunk should NOT contain page 2 content assert "bbb" not in result_zero[0][0] # Page 2 chunk should NOT contain page 1 content @@ -194,7 +195,7 @@ def test_chunk_pages_output_format(): """Each result element is a (str, int) tuple.""" strat = _make_strategy() pages = [(1, "Short text one."), (2, "Short text two.")] - result = strat.chunk_pages(pages) + result = asyncio.run(strat.chunk_pages(pages)) for chunk_text, page_num in result: assert isinstance(chunk_text, str) diff --git a/backend/app/test/test_phase8_ingest.py b/backend/app/test/test_phase8_ingest.py index 8c3e892..b34aacb 100644 --- a/backend/app/test/test_phase8_ingest.py +++ b/backend/app/test/test_phase8_ingest.py @@ -199,7 +199,7 @@ def _mock_question_chunker(monkeypatch): self._chunk_metadata = self._chunk_metadata[:1] return ["Question: What is X?\n\nAnswer: X is Y."] - def chunk_pages(self, pages, overlap_tokens=0): + async def chunk_pages(self, pages, overlap_tokens=0): self._chunk_metadata = self._chunk_metadata[:1] return [("Question: What is X?\n\nAnswer: X is Y.", 1)] diff --git a/backend/app/utils/chunking.py b/backend/app/utils/chunking.py index 3bc7a45..fbd1a70 100644 --- a/backend/app/utils/chunking.py +++ b/backend/app/utils/chunking.py @@ -79,7 +79,7 @@ class TokenChunkingStrategy(ChunkingStrategy): return chunks - def chunk_pages( + async def chunk_pages( self, pages: List[Tuple[int, str]], overlap_tokens: int = 200 ) -> List[Tuple[str, int]]: """Chunk page-segmented text with overlap from adjacent pages. @@ -166,13 +166,16 @@ class QuestionChunkingStrategy(ChunkingStrategy): self._chunk_metadata = [meta for _, _, meta in results] return [chunk_text for chunk_text, _, _ in results] - def chunk_pages( + async def chunk_pages( self, pages: List[Tuple[int, str]], overlap_tokens: int = 0 ) -> List[Tuple[str, int]]: """Split page-segmented text using Q&A detection (for PDF). Returns list of (chunk_text, page_number) where page_number references the question location for Q&A chunks. + + When regex fast-pass fails and an LLM client is available, + calls the LLM for structure detection (async). """ if not pages: return [] @@ -194,17 +197,12 @@ class QuestionChunkingStrategy(ChunkingStrategy): sections = split_english_qa(full_text) if not sections and self._llm_client is not None: - import asyncio prompt = build_structure_detection_prompt(full_text) try: - loop = asyncio.get_event_loop() - if loop.is_running(): - sections = [] - else: - response = loop.run_until_complete( - self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection") - ) - sections = parse_llm_structure_response(response) + response = await self._llm_client.complete( + prompt, temperature=0.3, step_name="StructureDetection" + ) + sections = parse_llm_structure_response(response) except Exception: logger.warning("LLM structure detection failed, using fallback", exc_info=True) @@ -227,7 +225,22 @@ def get_chunking_strategy(name: str, settings: "Settings") -> ChunkingStrategy: ChunkingStrategy instance. """ if name == "question": - return QuestionChunkingStrategy(settings=settings) + # Create llm_client if possible; fall back gracefully if config is incomplete. + llm_client = None + try: + from app.services.llm_client import LLMClient + client = LLMClient(settings=settings) + model = settings.qa_structure_model or settings.llm_model_name + if model and settings.llm_model_name: + client.model = model + llm_client = client + except Exception: + logger.warning( + "Could not create LLM client for Q&A chunking; " + "falling back to regex-only detection", + exc_info=True, + ) + return QuestionChunkingStrategy(settings=settings, llm_client=llm_client) return TokenChunkingStrategy( chunk_size=settings.chunk_size, overlap=settings.chunk_overlap,