fix: Q\&A chunking always fell back to token — LLM never called, missing API fields

Three bugs caused 'Chunk by Question' to silently produce token chunks:

1. QuestionChunkingStrategy.chunk_pages() had a broken event-loop check
   that always skipped LLM structure detection in FastAPI's async context.
   Fixed by making chunk_pages() async and removing the is_running() guard.

2. get_chunking_strategy() factory never passed an LLMClient to
   QuestionChunkingStrategy. Fixed by creating LLMClient in the factory
   with graceful fallback to regex-only when config is incomplete.

3. rag.list_documents() and list_chunks() didn't extract strategy_type
   or Q&A fields from ChromaDB metadata, so the frontend always showed
   chunking_strategy='token' and null Q&A fields. Fixed by reading
   these fields from ChromaDB and routing them through the API.

Also: TokenChunkingStrategy.chunk_pages() made async for consistency
with the question strategy; ingest router updated to await it.
Tests updated (asyncio.run() for sync tests, async mock chunk_pages).
This commit is contained in:
Woody 2026-05-15 14:46:45 +08:00
parent 9bef65de7b
commit 73c1789698
6 changed files with 59 additions and 25 deletions

View File

@ -32,6 +32,7 @@ async def list_documents():
filename=d["filename"], filename=d["filename"],
chunk_count=d["chunk_count"], chunk_count=d["chunk_count"],
upload_date=d["upload_date"], upload_date=d["upload_date"],
chunking_strategy=d.get("chunking_strategy", "token"),
) )
for d in doc_list for d in doc_list
] ]
@ -59,6 +60,14 @@ async def list_chunks(document_id: str):
content_summary=c["content_summary"], content_summary=c["content_summary"],
page_number=c.get("page_number"), page_number=c.get("page_number"),
chunk_file_path=c.get("chunk_file_path"), chunk_file_path=c.get("chunk_file_path"),
strategy_type=c.get("strategy_type"),
question_index=c.get("question_index"),
question_id=c.get("question_id"),
question_text=c.get("question_text"),
section_heading=c.get("section_heading"),
answer_contains_table=c.get("answer_contains_table"),
source_page_range=c.get("source_page_range"),
parent_topic=c.get("parent_topic"),
) )
for c in chunks for c in chunks
] ]

View File

@ -90,7 +90,7 @@ async def ingest_document(
detail="Document appears to be empty or could not be parsed", detail="Document appears to be empty or could not be parsed",
) )
chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap) chunked = await chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
chunk_texts = [text for text, _ in chunked] chunk_texts = [text for text, _ in chunked]
page_numbers = [pn for _, pn in chunked] page_numbers = [pn for _, pn in chunked]

View File

@ -280,7 +280,7 @@ class RAGService:
if not all_data["metadatas"]: if not all_data["metadatas"]:
return [], 0, 0 return [], 0, 0
docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": ""}) docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": "", "chunking_strategy": "token"})
for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"]): for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"]):
parts = chunk_id.rsplit("_", 1) parts = chunk_id.rsplit("_", 1)
@ -289,6 +289,8 @@ class RAGService:
docs[doc_id]["filename"] = meta.get("filename", "unknown") docs[doc_id]["filename"] = meta.get("filename", "unknown")
docs[doc_id]["chunk_count"] += 1 docs[doc_id]["chunk_count"] += 1
docs[doc_id]["upload_date"] = meta.get("upload_date", "") docs[doc_id]["upload_date"] = meta.get("upload_date", "")
if meta.get("strategy_type") == "question":
docs[doc_id]["chunking_strategy"] = "question"
total_chunks = sum(d["chunk_count"] for d in docs.values()) total_chunks = sum(d["chunk_count"] for d in docs.values())
doc_list = [ doc_list = [
@ -297,6 +299,7 @@ class RAGService:
"filename": info["filename"], "filename": info["filename"],
"chunk_count": info["chunk_count"], "chunk_count": info["chunk_count"],
"upload_date": info["upload_date"], "upload_date": info["upload_date"],
"chunking_strategy": info["chunking_strategy"],
} }
for doc_id, info in docs.items() for doc_id, info in docs.items()
] ]
@ -315,6 +318,14 @@ class RAGService:
"content_summary": meta.get("content_summary", ""), "content_summary": meta.get("content_summary", ""),
"page_number": meta.get("page_number"), "page_number": meta.get("page_number"),
"chunk_file_path": meta.get("chunk_file_path"), "chunk_file_path": meta.get("chunk_file_path"),
"strategy_type": meta.get("strategy_type"),
"question_index": meta.get("question_index"),
"question_id": meta.get("question_id"),
"question_text": meta.get("question_text"),
"section_heading": meta.get("section_heading"),
"answer_contains_table": meta.get("answer_contains_table"),
"source_page_range": meta.get("source_page_range"),
"parent_topic": meta.get("parent_topic"),
}) })
chunks.sort(key=lambda x: x["chunk_index"]) chunks.sort(key=lambda x: x["chunk_index"])

View File

@ -4,6 +4,7 @@ Tests for TokenChunkingStrategy.chunk_pages() which creates one chunk per page
with overlap context from adjacent pages. with overlap context from adjacent pages.
""" """
import asyncio
import importlib.util import importlib.util
from pathlib import Path from pathlib import Path
import pytest import pytest
@ -45,7 +46,7 @@ def test_chunk_pages_basic():
(2, _long_text("beta")), (2, _long_text("beta")),
(3, _long_text("gamma")), (3, _long_text("gamma")),
] ]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
assert len(result) == 3 assert len(result) == 3
# Each result is (chunk_text, page_number) # Each result is (chunk_text, page_number)
@ -61,7 +62,7 @@ def test_chunk_pages_single_page():
strat = _make_strategy() strat = _make_strategy()
text = _long_text("solo") text = _long_text("solo")
pages = [(1, text)] pages = [(1, text)]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
assert len(result) == 1 assert len(result) == 1
chunk_text, page_num = result[0] chunk_text, page_num = result[0]
@ -79,7 +80,7 @@ def test_chunk_pages_first_page():
(2, _long_text("second")), (2, _long_text("second")),
(3, _long_text("third")), (3, _long_text("third")),
] ]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
chunk_text, page_num = result[0] chunk_text, page_num = result[0]
assert page_num == 1 assert page_num == 1
@ -97,7 +98,7 @@ def test_chunk_pages_last_page():
(2, _long_text("second")), (2, _long_text("second")),
(3, _long_text("third")), (3, _long_text("third")),
] ]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
chunk_text, page_num = result[-1] chunk_text, page_num = result[-1]
assert page_num == 3 assert page_num == 3
@ -110,7 +111,7 @@ def test_chunk_pages_last_page():
def test_chunk_pages_empty_input(): def test_chunk_pages_empty_input():
"""Empty list returns empty list.""" """Empty list returns empty list."""
strat = _make_strategy() strat = _make_strategy()
result = strat.chunk_pages([]) result = asyncio.run(strat.chunk_pages([]))
assert result == [] assert result == []
@ -126,7 +127,7 @@ def test_chunk_pages_overlap_content():
(2, _long_text("page_two")), (2, _long_text("page_two")),
(3, _long_text("page_three")), (3, _long_text("page_three")),
] ]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
# Page 2 chunk should contain overlap from both neighbors # Page 2 chunk should contain overlap from both neighbors
middle_chunk, middle_page = result[1] middle_chunk, middle_page = result[1]
@ -156,7 +157,7 @@ def test_chunk_pages_returns_page_numbers():
(10, _long_text("ten")), (10, _long_text("ten")),
(99, _long_text("ninety_nine")), (99, _long_text("ninety_nine")),
] ]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
assert len(result) == 3 assert len(result) == 3
output_pages = [pn for _, pn in result] output_pages = [pn for _, pn in result]
@ -171,7 +172,7 @@ def test_chunk_pages_custom_overlap():
(1, _long_text("aaa")), (1, _long_text("aaa")),
(2, _long_text("bbb")), (2, _long_text("bbb")),
] ]
result = strat.chunk_pages(pages, overlap_tokens=5) result = asyncio.run(strat.chunk_pages(pages, overlap_tokens=5))
assert len(result) == 2 assert len(result) == 2
# Both pages present # Both pages present
@ -183,7 +184,7 @@ def test_chunk_pages_custom_overlap():
assert "aaa" in result[1][0] assert "aaa" in result[1][0]
# Verify with zero overlap # Verify with zero overlap
result_zero = strat.chunk_pages(pages, overlap_tokens=0) result_zero = asyncio.run(strat.chunk_pages(pages, overlap_tokens=0))
# Page 1 chunk should NOT contain page 2 content # Page 1 chunk should NOT contain page 2 content
assert "bbb" not in result_zero[0][0] assert "bbb" not in result_zero[0][0]
# Page 2 chunk should NOT contain page 1 content # Page 2 chunk should NOT contain page 1 content
@ -194,7 +195,7 @@ def test_chunk_pages_output_format():
"""Each result element is a (str, int) tuple.""" """Each result element is a (str, int) tuple."""
strat = _make_strategy() strat = _make_strategy()
pages = [(1, "Short text one."), (2, "Short text two.")] pages = [(1, "Short text one."), (2, "Short text two.")]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
for chunk_text, page_num in result: for chunk_text, page_num in result:
assert isinstance(chunk_text, str) assert isinstance(chunk_text, str)

View File

@ -199,7 +199,7 @@ def _mock_question_chunker(monkeypatch):
self._chunk_metadata = self._chunk_metadata[:1] self._chunk_metadata = self._chunk_metadata[:1]
return ["Question: What is X?\n\nAnswer: X is Y."] return ["Question: What is X?\n\nAnswer: X is Y."]
def chunk_pages(self, pages, overlap_tokens=0): async def chunk_pages(self, pages, overlap_tokens=0):
self._chunk_metadata = self._chunk_metadata[:1] self._chunk_metadata = self._chunk_metadata[:1]
return [("Question: What is X?\n\nAnswer: X is Y.", 1)] return [("Question: What is X?\n\nAnswer: X is Y.", 1)]

View File

@ -79,7 +79,7 @@ class TokenChunkingStrategy(ChunkingStrategy):
return chunks return chunks
def chunk_pages( async def chunk_pages(
self, pages: List[Tuple[int, str]], overlap_tokens: int = 200 self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
) -> List[Tuple[str, int]]: ) -> List[Tuple[str, int]]:
"""Chunk page-segmented text with overlap from adjacent pages. """Chunk page-segmented text with overlap from adjacent pages.
@ -166,13 +166,16 @@ class QuestionChunkingStrategy(ChunkingStrategy):
self._chunk_metadata = [meta for _, _, meta in results] self._chunk_metadata = [meta for _, _, meta in results]
return [chunk_text for chunk_text, _, _ in results] return [chunk_text for chunk_text, _, _ in results]
def chunk_pages( async def chunk_pages(
self, pages: List[Tuple[int, str]], overlap_tokens: int = 0 self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
) -> List[Tuple[str, int]]: ) -> List[Tuple[str, int]]:
"""Split page-segmented text using Q&A detection (for PDF). """Split page-segmented text using Q&A detection (for PDF).
Returns list of (chunk_text, page_number) where page_number Returns list of (chunk_text, page_number) where page_number
references the question location for Q&A chunks. references the question location for Q&A chunks.
When regex fast-pass fails and an LLM client is available,
calls the LLM for structure detection (async).
""" """
if not pages: if not pages:
return [] return []
@ -194,17 +197,12 @@ class QuestionChunkingStrategy(ChunkingStrategy):
sections = split_english_qa(full_text) sections = split_english_qa(full_text)
if not sections and self._llm_client is not None: if not sections and self._llm_client is not None:
import asyncio
prompt = build_structure_detection_prompt(full_text) prompt = build_structure_detection_prompt(full_text)
try: try:
loop = asyncio.get_event_loop() response = await self._llm_client.complete(
if loop.is_running(): prompt, temperature=0.3, step_name="StructureDetection"
sections = [] )
else: sections = parse_llm_structure_response(response)
response = loop.run_until_complete(
self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection")
)
sections = parse_llm_structure_response(response)
except Exception: except Exception:
logger.warning("LLM structure detection failed, using fallback", exc_info=True) logger.warning("LLM structure detection failed, using fallback", exc_info=True)
@ -227,7 +225,22 @@ def get_chunking_strategy(name: str, settings: "Settings") -> ChunkingStrategy:
ChunkingStrategy instance. ChunkingStrategy instance.
""" """
if name == "question": if name == "question":
return QuestionChunkingStrategy(settings=settings) # Create llm_client if possible; fall back gracefully if config is incomplete.
llm_client = None
try:
from app.services.llm_client import LLMClient
client = LLMClient(settings=settings)
model = settings.qa_structure_model or settings.llm_model_name
if model and settings.llm_model_name:
client.model = model
llm_client = client
except Exception:
logger.warning(
"Could not create LLM client for Q&A chunking; "
"falling back to regex-only detection",
exc_info=True,
)
return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)
return TokenChunkingStrategy( return TokenChunkingStrategy(
chunk_size=settings.chunk_size, chunk_size=settings.chunk_size,
overlap=settings.chunk_overlap, overlap=settings.chunk_overlap,