fix: Q\&A chunking always fell back to token — LLM never called, missing API fields
Three bugs caused 'Chunk by Question' to silently produce token chunks: 1. QuestionChunkingStrategy.chunk_pages() had a broken event-loop check that always skipped LLM structure detection in FastAPI's async context. Fixed by making chunk_pages() async and removing the is_running() guard. 2. get_chunking_strategy() factory never passed an LLMClient to QuestionChunkingStrategy. Fixed by creating LLMClient in the factory with graceful fallback to regex-only when config is incomplete. 3. rag.list_documents() and list_chunks() didn't extract strategy_type or Q&A fields from ChromaDB metadata, so the frontend always showed chunking_strategy='token' and null Q&A fields. Fixed by reading these fields from ChromaDB and routing them through the API. Also: TokenChunkingStrategy.chunk_pages() made async for consistency with the question strategy; ingest router updated to await it. Tests updated (asyncio.run() for sync tests, async mock chunk_pages).
This commit is contained in:
parent
9bef65de7b
commit
73c1789698
|
|
@ -32,6 +32,7 @@ async def list_documents():
|
||||||
filename=d["filename"],
|
filename=d["filename"],
|
||||||
chunk_count=d["chunk_count"],
|
chunk_count=d["chunk_count"],
|
||||||
upload_date=d["upload_date"],
|
upload_date=d["upload_date"],
|
||||||
|
chunking_strategy=d.get("chunking_strategy", "token"),
|
||||||
)
|
)
|
||||||
for d in doc_list
|
for d in doc_list
|
||||||
]
|
]
|
||||||
|
|
@ -59,6 +60,14 @@ async def list_chunks(document_id: str):
|
||||||
content_summary=c["content_summary"],
|
content_summary=c["content_summary"],
|
||||||
page_number=c.get("page_number"),
|
page_number=c.get("page_number"),
|
||||||
chunk_file_path=c.get("chunk_file_path"),
|
chunk_file_path=c.get("chunk_file_path"),
|
||||||
|
strategy_type=c.get("strategy_type"),
|
||||||
|
question_index=c.get("question_index"),
|
||||||
|
question_id=c.get("question_id"),
|
||||||
|
question_text=c.get("question_text"),
|
||||||
|
section_heading=c.get("section_heading"),
|
||||||
|
answer_contains_table=c.get("answer_contains_table"),
|
||||||
|
source_page_range=c.get("source_page_range"),
|
||||||
|
parent_topic=c.get("parent_topic"),
|
||||||
)
|
)
|
||||||
for c in chunks
|
for c in chunks
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -90,7 +90,7 @@ async def ingest_document(
|
||||||
detail="Document appears to be empty or could not be parsed",
|
detail="Document appears to be empty or could not be parsed",
|
||||||
)
|
)
|
||||||
|
|
||||||
chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
|
chunked = await chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
|
||||||
chunk_texts = [text for text, _ in chunked]
|
chunk_texts = [text for text, _ in chunked]
|
||||||
page_numbers = [pn for _, pn in chunked]
|
page_numbers = [pn for _, pn in chunked]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -280,7 +280,7 @@ class RAGService:
|
||||||
if not all_data["metadatas"]:
|
if not all_data["metadatas"]:
|
||||||
return [], 0, 0
|
return [], 0, 0
|
||||||
|
|
||||||
docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": ""})
|
docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": "", "chunking_strategy": "token"})
|
||||||
|
|
||||||
for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"]):
|
for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"]):
|
||||||
parts = chunk_id.rsplit("_", 1)
|
parts = chunk_id.rsplit("_", 1)
|
||||||
|
|
@ -289,6 +289,8 @@ class RAGService:
|
||||||
docs[doc_id]["filename"] = meta.get("filename", "unknown")
|
docs[doc_id]["filename"] = meta.get("filename", "unknown")
|
||||||
docs[doc_id]["chunk_count"] += 1
|
docs[doc_id]["chunk_count"] += 1
|
||||||
docs[doc_id]["upload_date"] = meta.get("upload_date", "")
|
docs[doc_id]["upload_date"] = meta.get("upload_date", "")
|
||||||
|
if meta.get("strategy_type") == "question":
|
||||||
|
docs[doc_id]["chunking_strategy"] = "question"
|
||||||
|
|
||||||
total_chunks = sum(d["chunk_count"] for d in docs.values())
|
total_chunks = sum(d["chunk_count"] for d in docs.values())
|
||||||
doc_list = [
|
doc_list = [
|
||||||
|
|
@ -297,6 +299,7 @@ class RAGService:
|
||||||
"filename": info["filename"],
|
"filename": info["filename"],
|
||||||
"chunk_count": info["chunk_count"],
|
"chunk_count": info["chunk_count"],
|
||||||
"upload_date": info["upload_date"],
|
"upload_date": info["upload_date"],
|
||||||
|
"chunking_strategy": info["chunking_strategy"],
|
||||||
}
|
}
|
||||||
for doc_id, info in docs.items()
|
for doc_id, info in docs.items()
|
||||||
]
|
]
|
||||||
|
|
@ -315,6 +318,14 @@ class RAGService:
|
||||||
"content_summary": meta.get("content_summary", ""),
|
"content_summary": meta.get("content_summary", ""),
|
||||||
"page_number": meta.get("page_number"),
|
"page_number": meta.get("page_number"),
|
||||||
"chunk_file_path": meta.get("chunk_file_path"),
|
"chunk_file_path": meta.get("chunk_file_path"),
|
||||||
|
"strategy_type": meta.get("strategy_type"),
|
||||||
|
"question_index": meta.get("question_index"),
|
||||||
|
"question_id": meta.get("question_id"),
|
||||||
|
"question_text": meta.get("question_text"),
|
||||||
|
"section_heading": meta.get("section_heading"),
|
||||||
|
"answer_contains_table": meta.get("answer_contains_table"),
|
||||||
|
"source_page_range": meta.get("source_page_range"),
|
||||||
|
"parent_topic": meta.get("parent_topic"),
|
||||||
})
|
})
|
||||||
|
|
||||||
chunks.sort(key=lambda x: x["chunk_index"])
|
chunks.sort(key=lambda x: x["chunk_index"])
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ Tests for TokenChunkingStrategy.chunk_pages() which creates one chunk per page
|
||||||
with overlap context from adjacent pages.
|
with overlap context from adjacent pages.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import importlib.util
|
import importlib.util
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import pytest
|
import pytest
|
||||||
|
|
@ -45,7 +46,7 @@ def test_chunk_pages_basic():
|
||||||
(2, _long_text("beta")),
|
(2, _long_text("beta")),
|
||||||
(3, _long_text("gamma")),
|
(3, _long_text("gamma")),
|
||||||
]
|
]
|
||||||
result = strat.chunk_pages(pages)
|
result = asyncio.run(strat.chunk_pages(pages))
|
||||||
|
|
||||||
assert len(result) == 3
|
assert len(result) == 3
|
||||||
# Each result is (chunk_text, page_number)
|
# Each result is (chunk_text, page_number)
|
||||||
|
|
@ -61,7 +62,7 @@ def test_chunk_pages_single_page():
|
||||||
strat = _make_strategy()
|
strat = _make_strategy()
|
||||||
text = _long_text("solo")
|
text = _long_text("solo")
|
||||||
pages = [(1, text)]
|
pages = [(1, text)]
|
||||||
result = strat.chunk_pages(pages)
|
result = asyncio.run(strat.chunk_pages(pages))
|
||||||
|
|
||||||
assert len(result) == 1
|
assert len(result) == 1
|
||||||
chunk_text, page_num = result[0]
|
chunk_text, page_num = result[0]
|
||||||
|
|
@ -79,7 +80,7 @@ def test_chunk_pages_first_page():
|
||||||
(2, _long_text("second")),
|
(2, _long_text("second")),
|
||||||
(3, _long_text("third")),
|
(3, _long_text("third")),
|
||||||
]
|
]
|
||||||
result = strat.chunk_pages(pages)
|
result = asyncio.run(strat.chunk_pages(pages))
|
||||||
|
|
||||||
chunk_text, page_num = result[0]
|
chunk_text, page_num = result[0]
|
||||||
assert page_num == 1
|
assert page_num == 1
|
||||||
|
|
@ -97,7 +98,7 @@ def test_chunk_pages_last_page():
|
||||||
(2, _long_text("second")),
|
(2, _long_text("second")),
|
||||||
(3, _long_text("third")),
|
(3, _long_text("third")),
|
||||||
]
|
]
|
||||||
result = strat.chunk_pages(pages)
|
result = asyncio.run(strat.chunk_pages(pages))
|
||||||
|
|
||||||
chunk_text, page_num = result[-1]
|
chunk_text, page_num = result[-1]
|
||||||
assert page_num == 3
|
assert page_num == 3
|
||||||
|
|
@ -110,7 +111,7 @@ def test_chunk_pages_last_page():
|
||||||
def test_chunk_pages_empty_input():
|
def test_chunk_pages_empty_input():
|
||||||
"""Empty list returns empty list."""
|
"""Empty list returns empty list."""
|
||||||
strat = _make_strategy()
|
strat = _make_strategy()
|
||||||
result = strat.chunk_pages([])
|
result = asyncio.run(strat.chunk_pages([]))
|
||||||
assert result == []
|
assert result == []
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -126,7 +127,7 @@ def test_chunk_pages_overlap_content():
|
||||||
(2, _long_text("page_two")),
|
(2, _long_text("page_two")),
|
||||||
(3, _long_text("page_three")),
|
(3, _long_text("page_three")),
|
||||||
]
|
]
|
||||||
result = strat.chunk_pages(pages)
|
result = asyncio.run(strat.chunk_pages(pages))
|
||||||
|
|
||||||
# Page 2 chunk should contain overlap from both neighbors
|
# Page 2 chunk should contain overlap from both neighbors
|
||||||
middle_chunk, middle_page = result[1]
|
middle_chunk, middle_page = result[1]
|
||||||
|
|
@ -156,7 +157,7 @@ def test_chunk_pages_returns_page_numbers():
|
||||||
(10, _long_text("ten")),
|
(10, _long_text("ten")),
|
||||||
(99, _long_text("ninety_nine")),
|
(99, _long_text("ninety_nine")),
|
||||||
]
|
]
|
||||||
result = strat.chunk_pages(pages)
|
result = asyncio.run(strat.chunk_pages(pages))
|
||||||
|
|
||||||
assert len(result) == 3
|
assert len(result) == 3
|
||||||
output_pages = [pn for _, pn in result]
|
output_pages = [pn for _, pn in result]
|
||||||
|
|
@ -171,7 +172,7 @@ def test_chunk_pages_custom_overlap():
|
||||||
(1, _long_text("aaa")),
|
(1, _long_text("aaa")),
|
||||||
(2, _long_text("bbb")),
|
(2, _long_text("bbb")),
|
||||||
]
|
]
|
||||||
result = strat.chunk_pages(pages, overlap_tokens=5)
|
result = asyncio.run(strat.chunk_pages(pages, overlap_tokens=5))
|
||||||
|
|
||||||
assert len(result) == 2
|
assert len(result) == 2
|
||||||
# Both pages present
|
# Both pages present
|
||||||
|
|
@ -183,7 +184,7 @@ def test_chunk_pages_custom_overlap():
|
||||||
assert "aaa" in result[1][0]
|
assert "aaa" in result[1][0]
|
||||||
|
|
||||||
# Verify with zero overlap
|
# Verify with zero overlap
|
||||||
result_zero = strat.chunk_pages(pages, overlap_tokens=0)
|
result_zero = asyncio.run(strat.chunk_pages(pages, overlap_tokens=0))
|
||||||
# Page 1 chunk should NOT contain page 2 content
|
# Page 1 chunk should NOT contain page 2 content
|
||||||
assert "bbb" not in result_zero[0][0]
|
assert "bbb" not in result_zero[0][0]
|
||||||
# Page 2 chunk should NOT contain page 1 content
|
# Page 2 chunk should NOT contain page 1 content
|
||||||
|
|
@ -194,7 +195,7 @@ def test_chunk_pages_output_format():
|
||||||
"""Each result element is a (str, int) tuple."""
|
"""Each result element is a (str, int) tuple."""
|
||||||
strat = _make_strategy()
|
strat = _make_strategy()
|
||||||
pages = [(1, "Short text one."), (2, "Short text two.")]
|
pages = [(1, "Short text one."), (2, "Short text two.")]
|
||||||
result = strat.chunk_pages(pages)
|
result = asyncio.run(strat.chunk_pages(pages))
|
||||||
|
|
||||||
for chunk_text, page_num in result:
|
for chunk_text, page_num in result:
|
||||||
assert isinstance(chunk_text, str)
|
assert isinstance(chunk_text, str)
|
||||||
|
|
|
||||||
|
|
@ -199,7 +199,7 @@ def _mock_question_chunker(monkeypatch):
|
||||||
self._chunk_metadata = self._chunk_metadata[:1]
|
self._chunk_metadata = self._chunk_metadata[:1]
|
||||||
return ["Question: What is X?\n\nAnswer: X is Y."]
|
return ["Question: What is X?\n\nAnswer: X is Y."]
|
||||||
|
|
||||||
def chunk_pages(self, pages, overlap_tokens=0):
|
async def chunk_pages(self, pages, overlap_tokens=0):
|
||||||
self._chunk_metadata = self._chunk_metadata[:1]
|
self._chunk_metadata = self._chunk_metadata[:1]
|
||||||
return [("Question: What is X?\n\nAnswer: X is Y.", 1)]
|
return [("Question: What is X?\n\nAnswer: X is Y.", 1)]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -79,7 +79,7 @@ class TokenChunkingStrategy(ChunkingStrategy):
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
def chunk_pages(
|
async def chunk_pages(
|
||||||
self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
|
self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
|
||||||
) -> List[Tuple[str, int]]:
|
) -> List[Tuple[str, int]]:
|
||||||
"""Chunk page-segmented text with overlap from adjacent pages.
|
"""Chunk page-segmented text with overlap from adjacent pages.
|
||||||
|
|
@ -166,13 +166,16 @@ class QuestionChunkingStrategy(ChunkingStrategy):
|
||||||
self._chunk_metadata = [meta for _, _, meta in results]
|
self._chunk_metadata = [meta for _, _, meta in results]
|
||||||
return [chunk_text for chunk_text, _, _ in results]
|
return [chunk_text for chunk_text, _, _ in results]
|
||||||
|
|
||||||
def chunk_pages(
|
async def chunk_pages(
|
||||||
self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
|
self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
|
||||||
) -> List[Tuple[str, int]]:
|
) -> List[Tuple[str, int]]:
|
||||||
"""Split page-segmented text using Q&A detection (for PDF).
|
"""Split page-segmented text using Q&A detection (for PDF).
|
||||||
|
|
||||||
Returns list of (chunk_text, page_number) where page_number
|
Returns list of (chunk_text, page_number) where page_number
|
||||||
references the question location for Q&A chunks.
|
references the question location for Q&A chunks.
|
||||||
|
|
||||||
|
When regex fast-pass fails and an LLM client is available,
|
||||||
|
calls the LLM for structure detection (async).
|
||||||
"""
|
"""
|
||||||
if not pages:
|
if not pages:
|
||||||
return []
|
return []
|
||||||
|
|
@ -194,17 +197,12 @@ class QuestionChunkingStrategy(ChunkingStrategy):
|
||||||
sections = split_english_qa(full_text)
|
sections = split_english_qa(full_text)
|
||||||
|
|
||||||
if not sections and self._llm_client is not None:
|
if not sections and self._llm_client is not None:
|
||||||
import asyncio
|
|
||||||
prompt = build_structure_detection_prompt(full_text)
|
prompt = build_structure_detection_prompt(full_text)
|
||||||
try:
|
try:
|
||||||
loop = asyncio.get_event_loop()
|
response = await self._llm_client.complete(
|
||||||
if loop.is_running():
|
prompt, temperature=0.3, step_name="StructureDetection"
|
||||||
sections = []
|
)
|
||||||
else:
|
sections = parse_llm_structure_response(response)
|
||||||
response = loop.run_until_complete(
|
|
||||||
self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection")
|
|
||||||
)
|
|
||||||
sections = parse_llm_structure_response(response)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.warning("LLM structure detection failed, using fallback", exc_info=True)
|
logger.warning("LLM structure detection failed, using fallback", exc_info=True)
|
||||||
|
|
||||||
|
|
@ -227,7 +225,22 @@ def get_chunking_strategy(name: str, settings: "Settings") -> ChunkingStrategy:
|
||||||
ChunkingStrategy instance.
|
ChunkingStrategy instance.
|
||||||
"""
|
"""
|
||||||
if name == "question":
|
if name == "question":
|
||||||
return QuestionChunkingStrategy(settings=settings)
|
# Create llm_client if possible; fall back gracefully if config is incomplete.
|
||||||
|
llm_client = None
|
||||||
|
try:
|
||||||
|
from app.services.llm_client import LLMClient
|
||||||
|
client = LLMClient(settings=settings)
|
||||||
|
model = settings.qa_structure_model or settings.llm_model_name
|
||||||
|
if model and settings.llm_model_name:
|
||||||
|
client.model = model
|
||||||
|
llm_client = client
|
||||||
|
except Exception:
|
||||||
|
logger.warning(
|
||||||
|
"Could not create LLM client for Q&A chunking; "
|
||||||
|
"falling back to regex-only detection",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)
|
||||||
return TokenChunkingStrategy(
|
return TokenChunkingStrategy(
|
||||||
chunk_size=settings.chunk_size,
|
chunk_size=settings.chunk_size,
|
||||||
overlap=settings.chunk_overlap,
|
overlap=settings.chunk_overlap,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue