Merge branch 'RAG-workflow'

2026-05-18 14:42:00 +08:00 · 2026-05-18 14:42:00 +08:00 · 821159a198
parent e00bb8853d 73c1789698
commit 821159a198
6 changed files with 59 additions and 25 deletions
--- a/backend/app/routers/documents.py
+++ b/backend/app/routers/documents.py
@ -32,6 +32,7 @@ async def list_documents():
            filename=d["filename"],
            chunk_count=d["chunk_count"],
            upload_date=d["upload_date"],
+            chunking_strategy=d.get("chunking_strategy", "token"),
        )
        for d in doc_list
    ]
@ -59,6 +60,14 @@ async def list_chunks(document_id: str):
            content_summary=c["content_summary"],
            page_number=c.get("page_number"),
            chunk_file_path=c.get("chunk_file_path"),
+            strategy_type=c.get("strategy_type"),
+            question_index=c.get("question_index"),
+            question_id=c.get("question_id"),
+            question_text=c.get("question_text"),
+            section_heading=c.get("section_heading"),
+            answer_contains_table=c.get("answer_contains_table"),
+            source_page_range=c.get("source_page_range"),
+            parent_topic=c.get("parent_topic"),
        )
        for c in chunks
    ]
--- a/backend/app/routers/ingest.py
+++ b/backend/app/routers/ingest.py
@ -90,7 +90,7 @@ async def ingest_document(
                    detail="Document appears to be empty or could not be parsed",
                )

-            chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
+            chunked = await chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
            chunk_texts = [text for text, _ in chunked]
            page_numbers = [pn for _, pn in chunked]

--- a/backend/app/services/rag.py
+++ b/backend/app/services/rag.py
@ -280,7 +280,7 @@ class RAGService:
        if not all_data["metadatas"]:
            return [], 0, 0

-        docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": ""})
+        docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": "", "chunking_strategy": "token"})

        for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"]):
            parts = chunk_id.rsplit("_", 1)
@ -289,6 +289,8 @@ class RAGService:
            docs[doc_id]["filename"] = meta.get("filename", "unknown")
            docs[doc_id]["chunk_count"] += 1
            docs[doc_id]["upload_date"] = meta.get("upload_date", "")
+            if meta.get("strategy_type") == "question":
+                docs[doc_id]["chunking_strategy"] = "question"

        total_chunks = sum(d["chunk_count"] for d in docs.values())
        doc_list = [
@ -297,6 +299,7 @@ class RAGService:
                "filename": info["filename"],
                "chunk_count": info["chunk_count"],
                "upload_date": info["upload_date"],
+                "chunking_strategy": info["chunking_strategy"],
            }
            for doc_id, info in docs.items()
        ]
@ -315,6 +318,14 @@ class RAGService:
                    "content_summary": meta.get("content_summary", ""),
                    "page_number": meta.get("page_number"),
                    "chunk_file_path": meta.get("chunk_file_path"),
+                    "strategy_type": meta.get("strategy_type"),
+                    "question_index": meta.get("question_index"),
+                    "question_id": meta.get("question_id"),
+                    "question_text": meta.get("question_text"),
+                    "section_heading": meta.get("section_heading"),
+                    "answer_contains_table": meta.get("answer_contains_table"),
+                    "source_page_range": meta.get("source_page_range"),
+                    "parent_topic": meta.get("parent_topic"),
                })

        chunks.sort(key=lambda x: x["chunk_index"])
--- a/backend/app/test/test_phase1_page_aware_chunking.py
+++ b/backend/app/test/test_phase1_page_aware_chunking.py
@ -4,6 +4,7 @@ Tests for TokenChunkingStrategy.chunk_pages() which creates one chunk per page
 with overlap context from adjacent pages.
 """

+import asyncio
 import importlib.util
 from pathlib import Path
 import pytest
@ -45,7 +46,7 @@ def test_chunk_pages_basic():
        (2, _long_text("beta")),
        (3, _long_text("gamma")),
    ]
-    result = strat.chunk_pages(pages)
+    result = asyncio.run(strat.chunk_pages(pages))

    assert len(result) == 3
    # Each result is (chunk_text, page_number)
@ -61,7 +62,7 @@ def test_chunk_pages_single_page():
    strat = _make_strategy()
    text = _long_text("solo")
    pages = [(1, text)]
-    result = strat.chunk_pages(pages)
+    result = asyncio.run(strat.chunk_pages(pages))

    assert len(result) == 1
    chunk_text, page_num = result[0]
@ -79,7 +80,7 @@ def test_chunk_pages_first_page():
        (2, _long_text("second")),
        (3, _long_text("third")),
    ]
-    result = strat.chunk_pages(pages)
+    result = asyncio.run(strat.chunk_pages(pages))

    chunk_text, page_num = result[0]
    assert page_num == 1
@ -97,7 +98,7 @@ def test_chunk_pages_last_page():
        (2, _long_text("second")),
        (3, _long_text("third")),
    ]
-    result = strat.chunk_pages(pages)
+    result = asyncio.run(strat.chunk_pages(pages))

    chunk_text, page_num = result[-1]
    assert page_num == 3
@ -110,7 +111,7 @@ def test_chunk_pages_last_page():
 def test_chunk_pages_empty_input():
    """Empty list returns empty list."""
    strat = _make_strategy()
-    result = strat.chunk_pages([])
+    result = asyncio.run(strat.chunk_pages([]))
    assert result == []


@ -126,7 +127,7 @@ def test_chunk_pages_overlap_content():
        (2, _long_text("page_two")),
        (3, _long_text("page_three")),
    ]
-    result = strat.chunk_pages(pages)
+    result = asyncio.run(strat.chunk_pages(pages))

    # Page 2 chunk should contain overlap from both neighbors
    middle_chunk, middle_page = result[1]
@ -156,7 +157,7 @@ def test_chunk_pages_returns_page_numbers():
        (10, _long_text("ten")),
        (99, _long_text("ninety_nine")),
    ]
-    result = strat.chunk_pages(pages)
+    result = asyncio.run(strat.chunk_pages(pages))

    assert len(result) == 3
    output_pages = [pn for _, pn in result]
@ -171,7 +172,7 @@ def test_chunk_pages_custom_overlap():
        (1, _long_text("aaa")),
        (2, _long_text("bbb")),
    ]
-    result = strat.chunk_pages(pages, overlap_tokens=5)
+    result = asyncio.run(strat.chunk_pages(pages, overlap_tokens=5))

    assert len(result) == 2
    # Both pages present
@ -183,7 +184,7 @@ def test_chunk_pages_custom_overlap():
    assert "aaa" in result[1][0]

    # Verify with zero overlap
-    result_zero = strat.chunk_pages(pages, overlap_tokens=0)
+    result_zero = asyncio.run(strat.chunk_pages(pages, overlap_tokens=0))
    # Page 1 chunk should NOT contain page 2 content
    assert "bbb" not in result_zero[0][0]
    # Page 2 chunk should NOT contain page 1 content
@ -194,7 +195,7 @@ def test_chunk_pages_output_format():
    """Each result element is a (str, int) tuple."""
    strat = _make_strategy()
    pages = [(1, "Short text one."), (2, "Short text two.")]
-    result = strat.chunk_pages(pages)
+    result = asyncio.run(strat.chunk_pages(pages))

    for chunk_text, page_num in result:
        assert isinstance(chunk_text, str)
--- a/backend/app/test/test_phase8_ingest.py
+++ b/backend/app/test/test_phase8_ingest.py
@ -199,7 +199,7 @@ def _mock_question_chunker(monkeypatch):
            self._chunk_metadata = self._chunk_metadata[:1]
            return ["Question: What is X?\n\nAnswer: X is Y."]

-        def chunk_pages(self, pages, overlap_tokens=0):
+        async def chunk_pages(self, pages, overlap_tokens=0):
            self._chunk_metadata = self._chunk_metadata[:1]
            return [("Question: What is X?\n\nAnswer: X is Y.", 1)]

--- a/backend/app/utils/chunking.py
+++ b/backend/app/utils/chunking.py
@ -79,7 +79,7 @@ class TokenChunkingStrategy(ChunkingStrategy):

        return chunks

-    def chunk_pages(
+    async def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
    ) -> List[Tuple[str, int]]:
        """Chunk page-segmented text with overlap from adjacent pages.
@ -166,13 +166,16 @@ class QuestionChunkingStrategy(ChunkingStrategy):
        self._chunk_metadata = [meta for _, _, meta in results]
        return [chunk_text for chunk_text, _, _ in results]

-    def chunk_pages(
+    async def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
    ) -> List[Tuple[str, int]]:
        """Split page-segmented text using Q&A detection (for PDF).

        Returns list of (chunk_text, page_number) where page_number
        references the question location for Q&A chunks.
+
+        When regex fast-pass fails and an LLM client is available,
+        calls the LLM for structure detection (async).
        """
        if not pages:
            return []
@ -194,17 +197,12 @@ class QuestionChunkingStrategy(ChunkingStrategy):
            sections = split_english_qa(full_text)

        if not sections and self._llm_client is not None:
-            import asyncio
            prompt = build_structure_detection_prompt(full_text)
            try:
-                loop = asyncio.get_event_loop()
-                if loop.is_running():
-                    sections = []
-                else:
-                    response = loop.run_until_complete(
-                        self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection")
-                    )
-                    sections = parse_llm_structure_response(response)
+                response = await self._llm_client.complete(
+                    prompt, temperature=0.3, step_name="StructureDetection"
+                )
+                sections = parse_llm_structure_response(response)
            except Exception:
                logger.warning("LLM structure detection failed, using fallback", exc_info=True)

@ -227,7 +225,22 @@ def get_chunking_strategy(name: str, settings: "Settings") -> ChunkingStrategy:
        ChunkingStrategy instance.
    """
    if name == "question":
-        return QuestionChunkingStrategy(settings=settings)
+        # Create llm_client if possible; fall back gracefully if config is incomplete.
+        llm_client = None
+        try:
+            from app.services.llm_client import LLMClient
+            client = LLMClient(settings=settings)
+            model = settings.qa_structure_model or settings.llm_model_name
+            if model and settings.llm_model_name:
+                client.model = model
+                llm_client = client
+        except Exception:
+            logger.warning(
+                "Could not create LLM client for Q&A chunking; "
+                "falling back to regex-only detection",
+                exc_info=True,
+            )
+        return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)
    return TokenChunkingStrategy(
        chunk_size=settings.chunk_size,
        overlap=settings.chunk_overlap,