Merge branch 'RAG-workflow'

This commit is contained in:
Woody 2026-05-18 14:42:00 +08:00
commit 821159a198
6 changed files with 59 additions and 25 deletions

View File

@ -32,6 +32,7 @@ async def list_documents():
filename=d["filename"],
chunk_count=d["chunk_count"],
upload_date=d["upload_date"],
chunking_strategy=d.get("chunking_strategy", "token"),
)
for d in doc_list
]
@ -59,6 +60,14 @@ async def list_chunks(document_id: str):
content_summary=c["content_summary"],
page_number=c.get("page_number"),
chunk_file_path=c.get("chunk_file_path"),
strategy_type=c.get("strategy_type"),
question_index=c.get("question_index"),
question_id=c.get("question_id"),
question_text=c.get("question_text"),
section_heading=c.get("section_heading"),
answer_contains_table=c.get("answer_contains_table"),
source_page_range=c.get("source_page_range"),
parent_topic=c.get("parent_topic"),
)
for c in chunks
]

View File

@ -90,7 +90,7 @@ async def ingest_document(
detail="Document appears to be empty or could not be parsed",
)
chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
chunked = await chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
chunk_texts = [text for text, _ in chunked]
page_numbers = [pn for _, pn in chunked]

View File

@ -280,7 +280,7 @@ class RAGService:
if not all_data["metadatas"]:
return [], 0, 0
docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": ""})
docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": "", "chunking_strategy": "token"})
for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"]):
parts = chunk_id.rsplit("_", 1)
@ -289,6 +289,8 @@ class RAGService:
docs[doc_id]["filename"] = meta.get("filename", "unknown")
docs[doc_id]["chunk_count"] += 1
docs[doc_id]["upload_date"] = meta.get("upload_date", "")
if meta.get("strategy_type") == "question":
docs[doc_id]["chunking_strategy"] = "question"
total_chunks = sum(d["chunk_count"] for d in docs.values())
doc_list = [
@ -297,6 +299,7 @@ class RAGService:
"filename": info["filename"],
"chunk_count": info["chunk_count"],
"upload_date": info["upload_date"],
"chunking_strategy": info["chunking_strategy"],
}
for doc_id, info in docs.items()
]
@ -315,6 +318,14 @@ class RAGService:
"content_summary": meta.get("content_summary", ""),
"page_number": meta.get("page_number"),
"chunk_file_path": meta.get("chunk_file_path"),
"strategy_type": meta.get("strategy_type"),
"question_index": meta.get("question_index"),
"question_id": meta.get("question_id"),
"question_text": meta.get("question_text"),
"section_heading": meta.get("section_heading"),
"answer_contains_table": meta.get("answer_contains_table"),
"source_page_range": meta.get("source_page_range"),
"parent_topic": meta.get("parent_topic"),
})
chunks.sort(key=lambda x: x["chunk_index"])

View File

@ -4,6 +4,7 @@ Tests for TokenChunkingStrategy.chunk_pages() which creates one chunk per page
with overlap context from adjacent pages.
"""
import asyncio
import importlib.util
from pathlib import Path
import pytest
@ -45,7 +46,7 @@ def test_chunk_pages_basic():
(2, _long_text("beta")),
(3, _long_text("gamma")),
]
result = strat.chunk_pages(pages)
result = asyncio.run(strat.chunk_pages(pages))
assert len(result) == 3
# Each result is (chunk_text, page_number)
@ -61,7 +62,7 @@ def test_chunk_pages_single_page():
strat = _make_strategy()
text = _long_text("solo")
pages = [(1, text)]
result = strat.chunk_pages(pages)
result = asyncio.run(strat.chunk_pages(pages))
assert len(result) == 1
chunk_text, page_num = result[0]
@ -79,7 +80,7 @@ def test_chunk_pages_first_page():
(2, _long_text("second")),
(3, _long_text("third")),
]
result = strat.chunk_pages(pages)
result = asyncio.run(strat.chunk_pages(pages))
chunk_text, page_num = result[0]
assert page_num == 1
@ -97,7 +98,7 @@ def test_chunk_pages_last_page():
(2, _long_text("second")),
(3, _long_text("third")),
]
result = strat.chunk_pages(pages)
result = asyncio.run(strat.chunk_pages(pages))
chunk_text, page_num = result[-1]
assert page_num == 3
@ -110,7 +111,7 @@ def test_chunk_pages_last_page():
def test_chunk_pages_empty_input():
"""Empty list returns empty list."""
strat = _make_strategy()
result = strat.chunk_pages([])
result = asyncio.run(strat.chunk_pages([]))
assert result == []
@ -126,7 +127,7 @@ def test_chunk_pages_overlap_content():
(2, _long_text("page_two")),
(3, _long_text("page_three")),
]
result = strat.chunk_pages(pages)
result = asyncio.run(strat.chunk_pages(pages))
# Page 2 chunk should contain overlap from both neighbors
middle_chunk, middle_page = result[1]
@ -156,7 +157,7 @@ def test_chunk_pages_returns_page_numbers():
(10, _long_text("ten")),
(99, _long_text("ninety_nine")),
]
result = strat.chunk_pages(pages)
result = asyncio.run(strat.chunk_pages(pages))
assert len(result) == 3
output_pages = [pn for _, pn in result]
@ -171,7 +172,7 @@ def test_chunk_pages_custom_overlap():
(1, _long_text("aaa")),
(2, _long_text("bbb")),
]
result = strat.chunk_pages(pages, overlap_tokens=5)
result = asyncio.run(strat.chunk_pages(pages, overlap_tokens=5))
assert len(result) == 2
# Both pages present
@ -183,7 +184,7 @@ def test_chunk_pages_custom_overlap():
assert "aaa" in result[1][0]
# Verify with zero overlap
result_zero = strat.chunk_pages(pages, overlap_tokens=0)
result_zero = asyncio.run(strat.chunk_pages(pages, overlap_tokens=0))
# Page 1 chunk should NOT contain page 2 content
assert "bbb" not in result_zero[0][0]
# Page 2 chunk should NOT contain page 1 content
@ -194,7 +195,7 @@ def test_chunk_pages_output_format():
"""Each result element is a (str, int) tuple."""
strat = _make_strategy()
pages = [(1, "Short text one."), (2, "Short text two.")]
result = strat.chunk_pages(pages)
result = asyncio.run(strat.chunk_pages(pages))
for chunk_text, page_num in result:
assert isinstance(chunk_text, str)

View File

@ -199,7 +199,7 @@ def _mock_question_chunker(monkeypatch):
self._chunk_metadata = self._chunk_metadata[:1]
return ["Question: What is X?\n\nAnswer: X is Y."]
def chunk_pages(self, pages, overlap_tokens=0):
async def chunk_pages(self, pages, overlap_tokens=0):
self._chunk_metadata = self._chunk_metadata[:1]
return [("Question: What is X?\n\nAnswer: X is Y.", 1)]

View File

@ -79,7 +79,7 @@ class TokenChunkingStrategy(ChunkingStrategy):
return chunks
def chunk_pages(
async def chunk_pages(
self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
) -> List[Tuple[str, int]]:
"""Chunk page-segmented text with overlap from adjacent pages.
@ -166,13 +166,16 @@ class QuestionChunkingStrategy(ChunkingStrategy):
self._chunk_metadata = [meta for _, _, meta in results]
return [chunk_text for chunk_text, _, _ in results]
def chunk_pages(
async def chunk_pages(
self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
) -> List[Tuple[str, int]]:
"""Split page-segmented text using Q&A detection (for PDF).
Returns list of (chunk_text, page_number) where page_number
references the question location for Q&A chunks.
When regex fast-pass fails and an LLM client is available,
calls the LLM for structure detection (async).
"""
if not pages:
return []
@ -194,17 +197,12 @@ class QuestionChunkingStrategy(ChunkingStrategy):
sections = split_english_qa(full_text)
if not sections and self._llm_client is not None:
import asyncio
prompt = build_structure_detection_prompt(full_text)
try:
loop = asyncio.get_event_loop()
if loop.is_running():
sections = []
else:
response = loop.run_until_complete(
self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection")
)
sections = parse_llm_structure_response(response)
response = await self._llm_client.complete(
prompt, temperature=0.3, step_name="StructureDetection"
)
sections = parse_llm_structure_response(response)
except Exception:
logger.warning("LLM structure detection failed, using fallback", exc_info=True)
@ -227,7 +225,22 @@ def get_chunking_strategy(name: str, settings: "Settings") -> ChunkingStrategy:
ChunkingStrategy instance.
"""
if name == "question":
return QuestionChunkingStrategy(settings=settings)
# Create llm_client if possible; fall back gracefully if config is incomplete.
llm_client = None
try:
from app.services.llm_client import LLMClient
client = LLMClient(settings=settings)
model = settings.qa_structure_model or settings.llm_model_name
if model and settings.llm_model_name:
client.model = model
llm_client = client
except Exception:
logger.warning(
"Could not create LLM client for Q&A chunking; "
"falling back to regex-only detection",
exc_info=True,
)
return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)
return TokenChunkingStrategy(
chunk_size=settings.chunk_size,
overlap=settings.chunk_overlap,