Merge branch 'RAG-workflow'

This commit is contained in:
Woody 2026-05-18 14:42:00 +08:00
commit 821159a198
6 changed files with 59 additions and 25 deletions

View File

@ -32,6 +32,7 @@ async def list_documents():
filename=d["filename"], filename=d["filename"],
chunk_count=d["chunk_count"], chunk_count=d["chunk_count"],
upload_date=d["upload_date"], upload_date=d["upload_date"],
chunking_strategy=d.get("chunking_strategy", "token"),
) )
for d in doc_list for d in doc_list
] ]
@ -59,6 +60,14 @@ async def list_chunks(document_id: str):
content_summary=c["content_summary"], content_summary=c["content_summary"],
page_number=c.get("page_number"), page_number=c.get("page_number"),
chunk_file_path=c.get("chunk_file_path"), chunk_file_path=c.get("chunk_file_path"),
strategy_type=c.get("strategy_type"),
question_index=c.get("question_index"),
question_id=c.get("question_id"),
question_text=c.get("question_text"),
section_heading=c.get("section_heading"),
answer_contains_table=c.get("answer_contains_table"),
source_page_range=c.get("source_page_range"),
parent_topic=c.get("parent_topic"),
) )
for c in chunks for c in chunks
] ]

View File

@ -90,7 +90,7 @@ async def ingest_document(
detail="Document appears to be empty or could not be parsed", detail="Document appears to be empty or could not be parsed",
) )
chunked = chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap) chunked = await chunker.chunk_pages(pages, overlap_tokens=settings.chunk_overlap)
chunk_texts = [text for text, _ in chunked] chunk_texts = [text for text, _ in chunked]
page_numbers = [pn for _, pn in chunked] page_numbers = [pn for _, pn in chunked]

View File

@ -280,7 +280,7 @@ class RAGService:
if not all_data["metadatas"]: if not all_data["metadatas"]:
return [], 0, 0 return [], 0, 0
docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": ""}) docs = defaultdict(lambda: {"filename": "", "chunk_count": 0, "upload_date": "", "chunking_strategy": "token"})
for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"]): for chunk_id, meta in zip(all_data["ids"], all_data["metadatas"]):
parts = chunk_id.rsplit("_", 1) parts = chunk_id.rsplit("_", 1)
@ -289,6 +289,8 @@ class RAGService:
docs[doc_id]["filename"] = meta.get("filename", "unknown") docs[doc_id]["filename"] = meta.get("filename", "unknown")
docs[doc_id]["chunk_count"] += 1 docs[doc_id]["chunk_count"] += 1
docs[doc_id]["upload_date"] = meta.get("upload_date", "") docs[doc_id]["upload_date"] = meta.get("upload_date", "")
if meta.get("strategy_type") == "question":
docs[doc_id]["chunking_strategy"] = "question"
total_chunks = sum(d["chunk_count"] for d in docs.values()) total_chunks = sum(d["chunk_count"] for d in docs.values())
doc_list = [ doc_list = [
@ -297,6 +299,7 @@ class RAGService:
"filename": info["filename"], "filename": info["filename"],
"chunk_count": info["chunk_count"], "chunk_count": info["chunk_count"],
"upload_date": info["upload_date"], "upload_date": info["upload_date"],
"chunking_strategy": info["chunking_strategy"],
} }
for doc_id, info in docs.items() for doc_id, info in docs.items()
] ]
@ -315,6 +318,14 @@ class RAGService:
"content_summary": meta.get("content_summary", ""), "content_summary": meta.get("content_summary", ""),
"page_number": meta.get("page_number"), "page_number": meta.get("page_number"),
"chunk_file_path": meta.get("chunk_file_path"), "chunk_file_path": meta.get("chunk_file_path"),
"strategy_type": meta.get("strategy_type"),
"question_index": meta.get("question_index"),
"question_id": meta.get("question_id"),
"question_text": meta.get("question_text"),
"section_heading": meta.get("section_heading"),
"answer_contains_table": meta.get("answer_contains_table"),
"source_page_range": meta.get("source_page_range"),
"parent_topic": meta.get("parent_topic"),
}) })
chunks.sort(key=lambda x: x["chunk_index"]) chunks.sort(key=lambda x: x["chunk_index"])

View File

@ -4,6 +4,7 @@ Tests for TokenChunkingStrategy.chunk_pages() which creates one chunk per page
with overlap context from adjacent pages. with overlap context from adjacent pages.
""" """
import asyncio
import importlib.util import importlib.util
from pathlib import Path from pathlib import Path
import pytest import pytest
@ -45,7 +46,7 @@ def test_chunk_pages_basic():
(2, _long_text("beta")), (2, _long_text("beta")),
(3, _long_text("gamma")), (3, _long_text("gamma")),
] ]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
assert len(result) == 3 assert len(result) == 3
# Each result is (chunk_text, page_number) # Each result is (chunk_text, page_number)
@ -61,7 +62,7 @@ def test_chunk_pages_single_page():
strat = _make_strategy() strat = _make_strategy()
text = _long_text("solo") text = _long_text("solo")
pages = [(1, text)] pages = [(1, text)]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
assert len(result) == 1 assert len(result) == 1
chunk_text, page_num = result[0] chunk_text, page_num = result[0]
@ -79,7 +80,7 @@ def test_chunk_pages_first_page():
(2, _long_text("second")), (2, _long_text("second")),
(3, _long_text("third")), (3, _long_text("third")),
] ]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
chunk_text, page_num = result[0] chunk_text, page_num = result[0]
assert page_num == 1 assert page_num == 1
@ -97,7 +98,7 @@ def test_chunk_pages_last_page():
(2, _long_text("second")), (2, _long_text("second")),
(3, _long_text("third")), (3, _long_text("third")),
] ]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
chunk_text, page_num = result[-1] chunk_text, page_num = result[-1]
assert page_num == 3 assert page_num == 3
@ -110,7 +111,7 @@ def test_chunk_pages_last_page():
def test_chunk_pages_empty_input(): def test_chunk_pages_empty_input():
"""Empty list returns empty list.""" """Empty list returns empty list."""
strat = _make_strategy() strat = _make_strategy()
result = strat.chunk_pages([]) result = asyncio.run(strat.chunk_pages([]))
assert result == [] assert result == []
@ -126,7 +127,7 @@ def test_chunk_pages_overlap_content():
(2, _long_text("page_two")), (2, _long_text("page_two")),
(3, _long_text("page_three")), (3, _long_text("page_three")),
] ]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
# Page 2 chunk should contain overlap from both neighbors # Page 2 chunk should contain overlap from both neighbors
middle_chunk, middle_page = result[1] middle_chunk, middle_page = result[1]
@ -156,7 +157,7 @@ def test_chunk_pages_returns_page_numbers():
(10, _long_text("ten")), (10, _long_text("ten")),
(99, _long_text("ninety_nine")), (99, _long_text("ninety_nine")),
] ]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
assert len(result) == 3 assert len(result) == 3
output_pages = [pn for _, pn in result] output_pages = [pn for _, pn in result]
@ -171,7 +172,7 @@ def test_chunk_pages_custom_overlap():
(1, _long_text("aaa")), (1, _long_text("aaa")),
(2, _long_text("bbb")), (2, _long_text("bbb")),
] ]
result = strat.chunk_pages(pages, overlap_tokens=5) result = asyncio.run(strat.chunk_pages(pages, overlap_tokens=5))
assert len(result) == 2 assert len(result) == 2
# Both pages present # Both pages present
@ -183,7 +184,7 @@ def test_chunk_pages_custom_overlap():
assert "aaa" in result[1][0] assert "aaa" in result[1][0]
# Verify with zero overlap # Verify with zero overlap
result_zero = strat.chunk_pages(pages, overlap_tokens=0) result_zero = asyncio.run(strat.chunk_pages(pages, overlap_tokens=0))
# Page 1 chunk should NOT contain page 2 content # Page 1 chunk should NOT contain page 2 content
assert "bbb" not in result_zero[0][0] assert "bbb" not in result_zero[0][0]
# Page 2 chunk should NOT contain page 1 content # Page 2 chunk should NOT contain page 1 content
@ -194,7 +195,7 @@ def test_chunk_pages_output_format():
"""Each result element is a (str, int) tuple.""" """Each result element is a (str, int) tuple."""
strat = _make_strategy() strat = _make_strategy()
pages = [(1, "Short text one."), (2, "Short text two.")] pages = [(1, "Short text one."), (2, "Short text two.")]
result = strat.chunk_pages(pages) result = asyncio.run(strat.chunk_pages(pages))
for chunk_text, page_num in result: for chunk_text, page_num in result:
assert isinstance(chunk_text, str) assert isinstance(chunk_text, str)

View File

@ -199,7 +199,7 @@ def _mock_question_chunker(monkeypatch):
self._chunk_metadata = self._chunk_metadata[:1] self._chunk_metadata = self._chunk_metadata[:1]
return ["Question: What is X?\n\nAnswer: X is Y."] return ["Question: What is X?\n\nAnswer: X is Y."]
def chunk_pages(self, pages, overlap_tokens=0): async def chunk_pages(self, pages, overlap_tokens=0):
self._chunk_metadata = self._chunk_metadata[:1] self._chunk_metadata = self._chunk_metadata[:1]
return [("Question: What is X?\n\nAnswer: X is Y.", 1)] return [("Question: What is X?\n\nAnswer: X is Y.", 1)]

View File

@ -79,7 +79,7 @@ class TokenChunkingStrategy(ChunkingStrategy):
return chunks return chunks
def chunk_pages( async def chunk_pages(
self, pages: List[Tuple[int, str]], overlap_tokens: int = 200 self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
) -> List[Tuple[str, int]]: ) -> List[Tuple[str, int]]:
"""Chunk page-segmented text with overlap from adjacent pages. """Chunk page-segmented text with overlap from adjacent pages.
@ -166,13 +166,16 @@ class QuestionChunkingStrategy(ChunkingStrategy):
self._chunk_metadata = [meta for _, _, meta in results] self._chunk_metadata = [meta for _, _, meta in results]
return [chunk_text for chunk_text, _, _ in results] return [chunk_text for chunk_text, _, _ in results]
def chunk_pages( async def chunk_pages(
self, pages: List[Tuple[int, str]], overlap_tokens: int = 0 self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
) -> List[Tuple[str, int]]: ) -> List[Tuple[str, int]]:
"""Split page-segmented text using Q&A detection (for PDF). """Split page-segmented text using Q&A detection (for PDF).
Returns list of (chunk_text, page_number) where page_number Returns list of (chunk_text, page_number) where page_number
references the question location for Q&A chunks. references the question location for Q&A chunks.
When regex fast-pass fails and an LLM client is available,
calls the LLM for structure detection (async).
""" """
if not pages: if not pages:
return [] return []
@ -194,15 +197,10 @@ class QuestionChunkingStrategy(ChunkingStrategy):
sections = split_english_qa(full_text) sections = split_english_qa(full_text)
if not sections and self._llm_client is not None: if not sections and self._llm_client is not None:
import asyncio
prompt = build_structure_detection_prompt(full_text) prompt = build_structure_detection_prompt(full_text)
try: try:
loop = asyncio.get_event_loop() response = await self._llm_client.complete(
if loop.is_running(): prompt, temperature=0.3, step_name="StructureDetection"
sections = []
else:
response = loop.run_until_complete(
self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection")
) )
sections = parse_llm_structure_response(response) sections = parse_llm_structure_response(response)
except Exception: except Exception:
@ -227,7 +225,22 @@ def get_chunking_strategy(name: str, settings: "Settings") -> ChunkingStrategy:
ChunkingStrategy instance. ChunkingStrategy instance.
""" """
if name == "question": if name == "question":
return QuestionChunkingStrategy(settings=settings) # Create llm_client if possible; fall back gracefully if config is incomplete.
llm_client = None
try:
from app.services.llm_client import LLMClient
client = LLMClient(settings=settings)
model = settings.qa_structure_model or settings.llm_model_name
if model and settings.llm_model_name:
client.model = model
llm_client = client
except Exception:
logger.warning(
"Could not create LLM client for Q&A chunking; "
"falling back to regex-only detection",
exc_info=True,
)
return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)
return TokenChunkingStrategy( return TokenChunkingStrategy(
chunk_size=settings.chunk_size, chunk_size=settings.chunk_size,
overlap=settings.chunk_overlap, overlap=settings.chunk_overlap,