fix: Q\&A chunking always fell back to token — LLM never called, missing API fields
Three bugs caused 'Chunk by Question' to silently produce token chunks: 1. QuestionChunkingStrategy.chunk_pages() had a broken event-loop check that always skipped LLM structure detection in FastAPI's async context. Fixed by making chunk_pages() async and removing the is_running() guard. 2. get_chunking_strategy() factory never passed an LLMClient to QuestionChunkingStrategy. Fixed by creating LLMClient in the factory with graceful fallback to regex-only when config is incomplete. 3. rag.list_documents() and list_chunks() didn't extract strategy_type or Q&A fields from ChromaDB metadata, so the frontend always showed chunking_strategy='token' and null Q&A fields. Fixed by reading these fields from ChromaDB and routing them through the API. Also: TokenChunkingStrategy.chunk_pages() made async for consistency with the question strategy; ingest router updated to await it. Tests updated (asyncio.run() for sync tests, async mock chunk_pages).
This commit is contained in:
parent
9bef65de7b
commit
73c1789698
|
|
@ -32,6 +32,7 @@ async def list_documents():
|
|||
filename=d["filename"],
|
||||
chunk_count=d["chunk_count"],
|
||||
upload_date=d["upload_date"],
|
||||
chunking_strategy=d.get("chunking_strategy", "token"),
|
||||
)
|
||||
for d in doc_list
|
||||
]
|
||||
|
|
@ -59,6 +60,14 @@ async def list_chunks(document_id: str):
|
|||
content_summary=c["content_summary"],
|
||||
page_number=c.get("page_number"),
|
||||
chunk_file_path=c.get("chunk_file_path"),
|
||||
strategy_type=c.get("strategy_type"),
|
||||
question_index=c.get("question_index"),
|
||||
question_id=c.get("question_id"),
|
||||
question_text=c.get("question_text"),
|
||||
section_heading=c.get("section_heading"),
|
||||
answer_contains_table=c.get("answer_contains_table"),
|
||||
source_page_range=c.get("source_page_range"),
|
||||
parent_topic=c.get("parent_topic"),
|
||||
)
|
||||
for c in chunks
|
||||
]
|
||||
|
|
|
|||
|
|
@ -90,7 +90,7 @@ async def ingest_document(
|
|||
detail="Document appears to be empty or could not be parsed",
|
||||
)
|
||||
|
||||
chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
|
||||
chunked = await chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
|
||||
chunk_texts = [text for text, _ in chunked]
|
||||
page_numbers = [pn for _, pn in chunked]
|
||||
|
||||
|
|
|
|||
|
|
@ -280,7 +280,7 @@ class RAGService:
|
|||
if not all_data["metadatas"]:
|
||||
return [], 0, 0
|
||||
|
||||
docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": ""})
|
||||
docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": "", "chunking_strategy": "token"})
|
||||
|
||||
for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"]):
|
||||
parts = chunk_id.rsplit("_", 1)
|
||||
|
|
@ -289,6 +289,8 @@ class RAGService:
|
|||
docs[doc_id]["filename"] = meta.get("filename", "unknown")
|
||||
docs[doc_id]["chunk_count"] += 1
|
||||
docs[doc_id]["upload_date"] = meta.get("upload_date", "")
|
||||
if meta.get("strategy_type") == "question":
|
||||
docs[doc_id]["chunking_strategy"] = "question"
|
||||
|
||||
total_chunks = sum(d["chunk_count"] for d in docs.values())
|
||||
doc_list = [
|
||||
|
|
@ -297,6 +299,7 @@ class RAGService:
|
|||
"filename": info["filename"],
|
||||
"chunk_count": info["chunk_count"],
|
||||
"upload_date": info["upload_date"],
|
||||
"chunking_strategy": info["chunking_strategy"],
|
||||
}
|
||||
for doc_id, info in docs.items()
|
||||
]
|
||||
|
|
@ -315,6 +318,14 @@ class RAGService:
|
|||
"content_summary": meta.get("content_summary", ""),
|
||||
"page_number": meta.get("page_number"),
|
||||
"chunk_file_path": meta.get("chunk_file_path"),
|
||||
"strategy_type": meta.get("strategy_type"),
|
||||
"question_index": meta.get("question_index"),
|
||||
"question_id": meta.get("question_id"),
|
||||
"question_text": meta.get("question_text"),
|
||||
"section_heading": meta.get("section_heading"),
|
||||
"answer_contains_table": meta.get("answer_contains_table"),
|
||||
"source_page_range": meta.get("source_page_range"),
|
||||
"parent_topic": meta.get("parent_topic"),
|
||||
})
|
||||
|
||||
chunks.sort(key=lambda x: x["chunk_index"])
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ Tests for TokenChunkingStrategy.chunk_pages() which creates one chunk per page
|
|||
with overlap context from adjacent pages.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
|
@ -45,7 +46,7 @@ def test_chunk_pages_basic():
|
|||
(2, _long_text("beta")),
|
||||
(3, _long_text("gamma")),
|
||||
]
|
||||
result = strat.chunk_pages(pages)
|
||||
result = asyncio.run(strat.chunk_pages(pages))
|
||||
|
||||
assert len(result) == 3
|
||||
# Each result is (chunk_text, page_number)
|
||||
|
|
@ -61,7 +62,7 @@ def test_chunk_pages_single_page():
|
|||
strat = _make_strategy()
|
||||
text = _long_text("solo")
|
||||
pages = [(1, text)]
|
||||
result = strat.chunk_pages(pages)
|
||||
result = asyncio.run(strat.chunk_pages(pages))
|
||||
|
||||
assert len(result) == 1
|
||||
chunk_text, page_num = result[0]
|
||||
|
|
@ -79,7 +80,7 @@ def test_chunk_pages_first_page():
|
|||
(2, _long_text("second")),
|
||||
(3, _long_text("third")),
|
||||
]
|
||||
result = strat.chunk_pages(pages)
|
||||
result = asyncio.run(strat.chunk_pages(pages))
|
||||
|
||||
chunk_text, page_num = result[0]
|
||||
assert page_num == 1
|
||||
|
|
@ -97,7 +98,7 @@ def test_chunk_pages_last_page():
|
|||
(2, _long_text("second")),
|
||||
(3, _long_text("third")),
|
||||
]
|
||||
result = strat.chunk_pages(pages)
|
||||
result = asyncio.run(strat.chunk_pages(pages))
|
||||
|
||||
chunk_text, page_num = result[-1]
|
||||
assert page_num == 3
|
||||
|
|
@ -110,7 +111,7 @@ def test_chunk_pages_last_page():
|
|||
def test_chunk_pages_empty_input():
|
||||
"""Empty list returns empty list."""
|
||||
strat = _make_strategy()
|
||||
result = strat.chunk_pages([])
|
||||
result = asyncio.run(strat.chunk_pages([]))
|
||||
assert result == []
|
||||
|
||||
|
||||
|
|
@ -126,7 +127,7 @@ def test_chunk_pages_overlap_content():
|
|||
(2, _long_text("page_two")),
|
||||
(3, _long_text("page_three")),
|
||||
]
|
||||
result = strat.chunk_pages(pages)
|
||||
result = asyncio.run(strat.chunk_pages(pages))
|
||||
|
||||
# Page 2 chunk should contain overlap from both neighbors
|
||||
middle_chunk, middle_page = result[1]
|
||||
|
|
@ -156,7 +157,7 @@ def test_chunk_pages_returns_page_numbers():
|
|||
(10, _long_text("ten")),
|
||||
(99, _long_text("ninety_nine")),
|
||||
]
|
||||
result = strat.chunk_pages(pages)
|
||||
result = asyncio.run(strat.chunk_pages(pages))
|
||||
|
||||
assert len(result) == 3
|
||||
output_pages = [pn for _, pn in result]
|
||||
|
|
@ -171,7 +172,7 @@ def test_chunk_pages_custom_overlap():
|
|||
(1, _long_text("aaa")),
|
||||
(2, _long_text("bbb")),
|
||||
]
|
||||
result = strat.chunk_pages(pages, overlap_tokens=5)
|
||||
result = asyncio.run(strat.chunk_pages(pages, overlap_tokens=5))
|
||||
|
||||
assert len(result) == 2
|
||||
# Both pages present
|
||||
|
|
@ -183,7 +184,7 @@ def test_chunk_pages_custom_overlap():
|
|||
assert "aaa" in result[1][0]
|
||||
|
||||
# Verify with zero overlap
|
||||
result_zero = strat.chunk_pages(pages, overlap_tokens=0)
|
||||
result_zero = asyncio.run(strat.chunk_pages(pages, overlap_tokens=0))
|
||||
# Page 1 chunk should NOT contain page 2 content
|
||||
assert "bbb" not in result_zero[0][0]
|
||||
# Page 2 chunk should NOT contain page 1 content
|
||||
|
|
@ -194,7 +195,7 @@ def test_chunk_pages_output_format():
|
|||
"""Each result element is a (str, int) tuple."""
|
||||
strat = _make_strategy()
|
||||
pages = [(1, "Short text one."), (2, "Short text two.")]
|
||||
result = strat.chunk_pages(pages)
|
||||
result = asyncio.run(strat.chunk_pages(pages))
|
||||
|
||||
for chunk_text, page_num in result:
|
||||
assert isinstance(chunk_text, str)
|
||||
|
|
|
|||
|
|
@ -199,7 +199,7 @@ def _mock_question_chunker(monkeypatch):
|
|||
self._chunk_metadata = self._chunk_metadata[:1]
|
||||
return ["Question: What is X?\n\nAnswer: X is Y."]
|
||||
|
||||
def chunk_pages(self, pages, overlap_tokens=0):
|
||||
async def chunk_pages(self, pages, overlap_tokens=0):
|
||||
self._chunk_metadata = self._chunk_metadata[:1]
|
||||
return [("Question: What is X?\n\nAnswer: X is Y.", 1)]
|
||||
|
||||
|
|
|
|||
|
|
@ -79,7 +79,7 @@ class TokenChunkingStrategy(ChunkingStrategy):
|
|||
|
||||
return chunks
|
||||
|
||||
def chunk_pages(
|
||||
async def chunk_pages(
|
||||
self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
|
||||
) -> List[Tuple[str, int]]:
|
||||
"""Chunk page-segmented text with overlap from adjacent pages.
|
||||
|
|
@ -166,13 +166,16 @@ class QuestionChunkingStrategy(ChunkingStrategy):
|
|||
self._chunk_metadata = [meta for _, _, meta in results]
|
||||
return [chunk_text for chunk_text, _, _ in results]
|
||||
|
||||
def chunk_pages(
|
||||
async def chunk_pages(
|
||||
self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
|
||||
) -> List[Tuple[str, int]]:
|
||||
"""Split page-segmented text using Q&A detection (for PDF).
|
||||
|
||||
Returns list of (chunk_text, page_number) where page_number
|
||||
references the question location for Q&A chunks.
|
||||
|
||||
When regex fast-pass fails and an LLM client is available,
|
||||
calls the LLM for structure detection (async).
|
||||
"""
|
||||
if not pages:
|
||||
return []
|
||||
|
|
@ -194,17 +197,12 @@ class QuestionChunkingStrategy(ChunkingStrategy):
|
|||
sections = split_english_qa(full_text)
|
||||
|
||||
if not sections and self._llm_client is not None:
|
||||
import asyncio
|
||||
prompt = build_structure_detection_prompt(full_text)
|
||||
try:
|
||||
loop = asyncio.get_event_loop()
|
||||
if loop.is_running():
|
||||
sections = []
|
||||
else:
|
||||
response = loop.run_until_complete(
|
||||
self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection")
|
||||
)
|
||||
sections = parse_llm_structure_response(response)
|
||||
response = await self._llm_client.complete(
|
||||
prompt, temperature=0.3, step_name="StructureDetection"
|
||||
)
|
||||
sections = parse_llm_structure_response(response)
|
||||
except Exception:
|
||||
logger.warning("LLM structure detection failed, using fallback", exc_info=True)
|
||||
|
||||
|
|
@ -227,7 +225,22 @@ def get_chunking_strategy(name: str, settings: "Settings") -> ChunkingStrategy:
|
|||
ChunkingStrategy instance.
|
||||
"""
|
||||
if name == "question":
|
||||
return QuestionChunkingStrategy(settings=settings)
|
||||
# Create llm_client if possible; fall back gracefully if config is incomplete.
|
||||
llm_client = None
|
||||
try:
|
||||
from app.services.llm_client import LLMClient
|
||||
client = LLMClient(settings=settings)
|
||||
model = settings.qa_structure_model or settings.llm_model_name
|
||||
if model and settings.llm_model_name:
|
||||
client.model = model
|
||||
llm_client = client
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"Could not create LLM client for Q&A chunking; "
|
||||
"falling back to regex-only detection",
|
||||
exc_info=True,
|
||||
)
|
||||
return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)
|
||||
return TokenChunkingStrategy(
|
||||
chunk_size=settings.chunk_size,
|
||||
overlap=settings.chunk_overlap,
|
||||
|
|
|
|||
Loading…
Reference in New Issue