"""Chunking utilities for Phase 1.2.

Provides an abstract ChunkingStrategy and a concrete
TokenChunkingStrategy that uses tiktoken to chunk text into
token-based windows, a QuestionChunkingStrategy that chunks along
detected Q&A structure, and the get_chunking_strategy factory.
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from abc import ABC, abstractmethod
|
|
from typing import TYPE_CHECKING, List, Optional, Tuple
|
|
|
|
if TYPE_CHECKING:
|
|
from app.core.config import Settings
|
|
from app.services.llm_client import LLMClient
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ChunkingStrategy(ABC):
    """Common interface that every text chunking strategy implements."""

    @abstractmethod
    def chunk(self, text: str) -> List[str]:
        """Break ``text`` into an ordered list of chunk strings.

        Contract for implementations:

        * Empty or whitespace-only input should produce an empty list.
        * Chunks should follow the order of the input. Windows are defined
          on token boundaries, so the raw text of consecutive chunks may
          overlap when a strategy uses token-based windowing.

        Args:
            text: The text to split.

        Returns:
            The chunks, in document order.

        Raises:
            NotImplementedError: Always, when a subclass delegates to this
                base implementation instead of providing its own.
        """
        raise NotImplementedError
|
|
|
|
|
|
class TokenChunkingStrategy(ChunkingStrategy):
    """Chunk text by token windows using the tiktoken encoder.

    The strategy operates on token counts: each chunk contains up to
    ``chunk_size`` tokens, and consecutive chunks share ``overlap`` tokens.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200, encoding_name: str = "cl100k_base"):
        """Validate the window parameters and load the tokenizer.

        Args:
            chunk_size: Maximum number of tokens per chunk; must be positive.
            overlap: Number of tokens shared by consecutive chunks; must be
                non-negative. Values greater than or equal to ``chunk_size``
                are accepted; ``chunk`` then advances one token at a time.
            encoding_name: Name handed to ``tiktoken.get_encoding``.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap < 0:
            raise ValueError("overlap must be non-negative")
        self.chunk_size = chunk_size
        self.overlap = overlap

        # Lazy import to avoid import-time penalties in environments without tokenizers
        import tiktoken

        self._encoding = tiktoken.get_encoding(encoding_name)

    def chunk(self, text: str) -> List[str]:
        """Split ``text`` into overlapping windows of at most ``chunk_size`` tokens.

        Args:
            text: The text to split.

        Returns:
            The decoded windows in order. Empty or whitespace-only input, or
            input that encodes to no tokens, yields an empty list. The last
            window always ends on the final token, so no trailing chunk is
            produced that consists only of tokens already covered by the
            chunk before it.

        Raises:
            TypeError: If ``text`` is not a ``str``.
        """
        if not isinstance(text, str):
            raise TypeError("text must be a string")
        if not text.strip():
            return []

        tokens = self._encoding.encode(text)
        if not tokens:
            return []

        # Always advance by at least one token so the loop makes progress
        # even when overlap >= chunk_size.
        step = max(self.chunk_size - self.overlap, 1)
        total = len(tokens)

        # NOTE(review): windows are cut on token boundaries; confirm how the
        # encoder's decode treats a window that ends inside a multi-token
        # character.
        chunks: List[str] = []
        for start in range(0, total, step):
            end = start + self.chunk_size
            chunks.append(self._encoding.decode(tokens[start:end]))
            # Stop once this window reaches the last token. Testing only for
            # a short window would let a full window that ends exactly at
            # the end be followed by an extra chunk made purely of overlap.
            if end >= total:
                break

        return chunks

    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
    ) -> List[Tuple[str, int]]:
        """Chunk page-segmented text with overlap from adjacent pages.

        For each page, creates one chunk containing:
          [last overlap_tokens of previous page] + [full current page] + [first overlap_tokens of next page]

        The parts are joined with a newline. One chunk per page — a page is
        never split even if oversized; ``chunk_size`` and ``overlap`` from the
        constructor are not consulted here. The page_number in each result
        always refers to the main page (N), not the overlap pages.

        Args:
            pages: List of (page_number, page_text) tuples. 1-indexed.
            overlap_tokens: Number of tokens to include from adjacent pages.
                Values of zero or less disable the overlap.

        Returns:
            List of (chunk_text, page_number) tuples. One chunk per page.
        """
        if not pages:
            return []

        use_overlap = overlap_tokens > 0
        # Token lists are only needed to cut the overlap fragments, so skip
        # the encoding pass entirely when no overlap was requested.
        tokenized: List[List[int]] = (
            [self._encoding.encode(text) for _, text in pages] if use_overlap else []
        )
        last_index = len(pages) - 1

        results: List[Tuple[str, int]] = []
        for index, (page_num, page_text) in enumerate(pages):
            parts: List[str] = []

            if use_overlap and index > 0:
                # Tail of the previous page.
                parts.append(self._encoding.decode(tokenized[index - 1][-overlap_tokens:]))

            parts.append(page_text)

            if use_overlap and index < last_index:
                # Head of the next page.
                parts.append(self._encoding.decode(tokenized[index + 1][:overlap_tokens]))

            results.append(("\n".join(parts), page_num))

        return results
|
|
|
|
|
|
class QuestionChunkingStrategy(ChunkingStrategy):
    """Chunk text by detecting Q&A structure using LLM and/or regex patterns.

    Designed for LegCo documents with explicit 問/答 or Q1/Q2 markers.
    Falls back to a single narrative section for documents in which no
    Q&A structure is detected.
    """

    def __init__(
        self,
        settings: "Settings",
        llm_client: Optional["LLMClient"] = None,
    ):
        """Store the configuration used by later chunking calls.

        Args:
            settings: Application settings. ``qa_max_chunk_tokens`` is read
                from it when present; otherwise 3000 is used.
            llm_client: Optional client used by ``chunk_pages`` for structure
                detection when the pattern-based splitters find nothing.
        """
        self._settings = settings
        self._llm_client = llm_client
        self._max_tokens = getattr(settings, "qa_max_chunk_tokens", 3000)
        # Metadata element of each result from the most recent chunk() or
        # chunk_pages() call.
        self._chunk_metadata: List[dict] = []

    def chunk(self, text: str) -> List[str]:
        """Split text into chunks using Q&A detection (for DOCX/TXT).

        Tries the Chinese splitter, then the English splitter, and finally
        treats the whole text as one narrative section on page 1. The LLM
        client is not used on this path.

        Args:
            text: The document text.

        Returns:
            The chunk texts, or an empty list for empty/whitespace-only input.
        """
        if not text or not text.strip():
            return []

        from app.utils.qa_chunking import (
            split_chinese_qa,
            split_english_qa,
            build_chunks_from_sections,
            Section,
        )

        sections = split_chinese_qa(text)
        if not sections:
            sections = split_english_qa(text)

        if not sections:
            sections = [Section(type="narrative", content=text, start_page=1, end_page=1)]

        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [chunk_text for chunk_text, _, _ in results]

    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
    ) -> List[Tuple[str, int]]:
        """Split page-segmented text using Q&A detection (for PDF).

        Detection order: Chinese splitter, English splitter, LLM structure
        detection (only when a client was supplied and no event loop is
        running in the calling thread), and finally a single narrative
        section spanning all pages.

        Args:
            pages: List of (page_number, page_text) tuples.
            overlap_tokens: Not used by this strategy.

        Returns:
            List of (chunk_text, page_number) where page_number
            references the question location for Q&A chunks.
        """
        if not pages:
            return []

        from app.utils.qa_chunking import (
            preprocess_text,
            split_chinese_qa,
            split_english_qa,
            build_chunks_from_sections,
            parse_llm_structure_response,
            build_structure_detection_prompt,
            Section,
        )

        full_text = preprocess_text(pages)

        sections = split_chinese_qa(full_text)
        if not sections:
            sections = split_english_qa(full_text)

        if not sections and self._llm_client is not None:
            prompt = build_structure_detection_prompt(full_text)
            try:
                response = self._complete_blocking(prompt)
                if response is not None:
                    sections = parse_llm_structure_response(response)
            except Exception:
                # Best effort: any failure here degrades to the narrative
                # fallback below instead of aborting the chunking run.
                logger.warning("LLM structure detection failed, using fallback", exc_info=True)

        if not sections:
            sections = [Section(type="narrative", content=full_text, start_page=1, end_page=len(pages))]

        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [(chunk_text, page_num) for chunk_text, page_num, _ in results]

    def _complete_blocking(self, prompt: str):
        """Run the LLM structure-detection request to completion synchronously.

        Returns:
            Whatever the client's ``complete`` coroutine returns, or ``None``
            when an event loop is already running in this thread, because a
            synchronous method cannot block on a coroutine there.
        """
        import asyncio

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread, so blocking below is safe.
            pass
        else:
            logger.debug("Event loop already running; skipping LLM structure detection")
            return None

        # A private loop works in any thread, including worker threads that
        # have no current loop, and leaves the thread's current-loop setting
        # untouched.
        # NOTE(review): confirm the client tolerates a fresh loop per call
        # (e.g. that it holds no connection pool bound to another loop).
        loop = asyncio.new_event_loop()
        try:
            return loop.run_until_complete(
                self._llm_client.complete(prompt, temperature=0.3, step_name="StructureDetection")
            )
        finally:
            loop.close()
|
|
|
|
|
|
def get_chunking_strategy(
    name: str,
    settings: "Settings",
    llm_client: Optional["LLMClient"] = None,
) -> ChunkingStrategy:
    """Factory: return the named chunking strategy.

    Args:
        name: "token" or "question". Any value other than "question"
            selects the token strategy.
        settings: Application settings instance. ``chunk_size`` and
            ``chunk_overlap`` are read from it for the token strategy.
        llm_client: Optional LLM client forwarded to the question strategy
            so that its LLM-based structure detection can be used. Ignored
            by the token strategy.

    Returns:
        ChunkingStrategy instance.
    """
    if name == "question":
        return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)
    return TokenChunkingStrategy(
        chunk_size=settings.chunk_size,
        overlap=settings.chunk_overlap,
    )
|