"""Chunking utilities for Phase 1.2.

Provides an abstract ChunkingStrategy, a concrete TokenChunkingStrategy that
uses tiktoken to chunk text into token-based windows, a
QuestionChunkingStrategy that chunks along detected Q&A structure, and the
get_chunking_strategy factory.
"""

from __future__ import annotations

import logging
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional, Tuple

if TYPE_CHECKING:
    from app.core.config import Settings
    from app.services.llm_client import LLMClient

logger = logging.getLogger(__name__)


class ChunkingStrategy(ABC):
    """Abstract base class for text chunking strategies."""

    @abstractmethod
    def chunk(self, text: str) -> List[str]:
        """Split text into a list of chunks (strings).

        Implementations should return an empty list for empty or
        whitespace-only input. Chunks are returned in document order;
        window-based implementations may repeat some text between
        consecutive chunks (overlap).
        """
        raise NotImplementedError


class TokenChunkingStrategy(ChunkingStrategy):
    """Chunk text by token windows using the tiktoken encoder.

    The strategy operates on token counts: each chunk contains up to
    chunk_size tokens with overlap of overlap tokens between consecutive
    chunks.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        overlap: int = 200,
        encoding_name: str = "cl100k_base",
    ):
        """Validate the window parameters and load the tiktoken encoding.

        Args:
            chunk_size: Maximum number of tokens per chunk. Must be positive.
            overlap: Number of tokens shared by consecutive chunks. Must be
                non-negative.
            encoding_name: Name passed to tiktoken.get_encoding.

        Raises:
            ValueError: If chunk_size <= 0 or overlap < 0.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap < 0:
            raise ValueError("overlap must be non-negative")

        self.chunk_size = chunk_size
        self.overlap = overlap

        # Lazy import to avoid import-time penalties in environments without tokenizers
        import tiktoken

        self._encoding = tiktoken.get_encoding(encoding_name)

    def chunk(self, text: str) -> List[str]:
        """Split text into windows of at most chunk_size tokens.

        Consecutive windows start (chunk_size - overlap) tokens apart. The
        walk stops as soon as a window reaches the last token, so the result
        never ends with a chunk made only of tokens already present in the
        previous chunk.

        Args:
            text: The text to chunk.

        Returns:
            Decoded chunk strings in document order; an empty list for empty
            or whitespace-only input.

        Raises:
            TypeError: If text is not a string.
        """
        if not isinstance(text, str):
            raise TypeError("text must be a string")
        if not text.strip():
            return []

        tokens = self._encoding.encode(text)
        if not tokens:
            return []

        # An overlap >= chunk_size would give a zero or negative stride;
        # fall back to a stride of 1 so the walk always advances.
        step = max(self.chunk_size - self.overlap, 1)
        total = len(tokens)

        chunks: List[str] = []
        for start in range(0, total, step):
            end = start + self.chunk_size
            chunks.append(self._encoding.decode(tokens[start:end]))
            # This window already covers the final token: any further window
            # would contain nothing but overlap.
            if end >= total:
                break
        return chunks

    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
    ) -> List[Tuple[str, int]]:
        """Chunk page-segmented text with overlap from adjacent pages.

        For each page, creates one chunk containing:
          [last overlap_tokens of previous page]
          + [full current page]
          + [first overlap_tokens of next page]
        joined with newlines.

        One chunk per page — never splits a page even if oversized.
        The page_number metadata always refers to the main page (N),
        not overlap pages.

        Args:
            pages: List of (page_number, page_text) tuples. 1-indexed.
            overlap_tokens: Number of tokens to include from adjacent pages.
                Values <= 0 disable the overlap.

        Returns:
            List of (chunk_text, page_number) tuples. One chunk per page.
        """
        if not pages:
            return []

        # Tokenize every page once; neighbours are sliced from this cache.
        page_tokens: List[List[int]] = [
            self._encoding.encode(page_text) for _, page_text in pages
        ]
        use_overlap = overlap_tokens > 0
        last_index = len(pages) - 1

        results: List[Tuple[str, int]] = []
        for idx, (page_num, page_text) in enumerate(pages):
            parts: List[str] = []
            if use_overlap and idx > 0:
                tail = page_tokens[idx - 1][-overlap_tokens:]
                parts.append(self._encoding.decode(tail))
            parts.append(page_text)
            if use_overlap and idx < last_index:
                head = page_tokens[idx + 1][:overlap_tokens]
                parts.append(self._encoding.decode(head))
            results.append(("\n".join(parts), page_num))
        return results


class QuestionChunkingStrategy(ChunkingStrategy):
    """Chunk text by detecting Q&A structure using LLM and/or regex patterns.

    Designed for LegCo documents with explicit 問/答 or Q1/Q2 markers.
    Falls back to section-based chunking for narrative-only documents.
    """

    def __init__(
        self,
        settings: "Settings",
        llm_client: Optional["LLMClient"] = None,
    ):
        """Store configuration for Q&A chunking.

        Args:
            settings: Application settings; qa_max_chunk_tokens is read from
                it, defaulting to 3000 when the attribute is absent.
            llm_client: Optional client used by chunk_pages for structure
                detection when the pattern-based splitters find nothing.
        """
        self._settings = settings
        self._llm_client = llm_client
        self._max_tokens = getattr(settings, "qa_max_chunk_tokens", 3000)
        # Metadata of the chunks produced by the most recent chunk() or
        # chunk_pages() call.
        self._chunk_metadata: List[dict] = []

    def chunk(self, text: str) -> List[str]:
        """Split text into chunks using Q&A detection (for DOCX/TXT).

        Tries the Chinese splitter, then the English splitter, and finally
        treats the whole text as a single narrative section.

        Args:
            text: The document text.

        Returns:
            Chunk strings; an empty list for empty or whitespace-only input.
        """
        if not text or not text.strip():
            return []

        from app.utils.qa_chunking import (
            Section,
            build_chunks_from_sections,
            split_chinese_qa,
            split_english_qa,
        )

        sections = split_chinese_qa(text)
        if not sections:
            sections = split_english_qa(text)
        if not sections:
            sections = [
                Section(type="narrative", content=text, start_page=1, end_page=1)
            ]

        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [chunk_text for chunk_text, _, _ in results]

    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 0
    ) -> List[Tuple[str, int]]:
        """Split page-segmented text using Q&A detection (for PDF).

        Detection order: Chinese splitter, English splitter, LLM structure
        detection (only when an llm_client was supplied), and finally a
        single narrative section spanning all pages.

        Args:
            pages: List of (page_number, page_text) tuples.
            overlap_tokens: Accepted for signature parity with
                TokenChunkingStrategy.chunk_pages; not used here.

        Returns:
            List of (chunk_text, page_number) where page_number references
            the question location for Q&A chunks.
        """
        if not pages:
            return []

        from app.utils.qa_chunking import (
            Section,
            build_chunks_from_sections,
            preprocess_text,
            split_chinese_qa,
            split_english_qa,
        )

        full_text = preprocess_text(pages)

        sections = split_chinese_qa(full_text)
        if not sections:
            sections = split_english_qa(full_text)
        if not sections and self._llm_client is not None:
            sections = self._detect_sections_with_llm(full_text)
        if not sections:
            sections = [
                Section(
                    type="narrative",
                    content=full_text,
                    start_page=1,
                    end_page=len(pages),
                )
            ]

        results = build_chunks_from_sections(sections, max_tokens=self._max_tokens)
        self._chunk_metadata = [meta for _, _, meta in results]
        return [(chunk_text, page_num) for chunk_text, page_num, _ in results]

    def _detect_sections_with_llm(self, full_text: str) -> list:
        """Ask the LLM client for the document structure, best effort.

        Returns the parsed sections, or an empty list when detection is
        skipped (an event loop is already running in this thread, so the
        call cannot be driven synchronously) or fails for any reason.
        """
        import asyncio

        from app.utils.qa_chunking import (
            build_structure_detection_prompt,
            parse_llm_structure_response,
        )

        try:
            asyncio.get_running_loop()
        except RuntimeError:
            pass  # No loop running in this thread: safe to block on the call.
        else:
            logger.warning(
                "Skipping LLM structure detection: an event loop is already "
                "running in this thread"
            )
            return []

        prompt = build_structure_detection_prompt(full_text)

        async def _complete():
            # Wrapped in a coroutine so any awaitable returned by the client
            # is acceptable to asyncio.run.
            return await self._llm_client.complete(
                prompt, temperature=0.3, step_name="StructureDetection"
            )

        try:
            # asyncio.run works in any thread without a running loop, unlike
            # asyncio.get_event_loop(), which raises in non-main threads.
            response = asyncio.run(_complete())
            return parse_llm_structure_response(response)
        except Exception:
            logger.warning(
                "LLM structure detection failed, using fallback", exc_info=True
            )
            return []


def get_chunking_strategy(
    name: str,
    settings: "Settings",
    llm_client: Optional["LLMClient"] = None,
) -> ChunkingStrategy:
    """Factory: return the named chunking strategy.

    Args:
        name: "token" or "question". Any value other than "question"
            selects the token strategy.
        settings: Application settings instance.
        llm_client: Optional LLM client forwarded to the question strategy;
            ignored by the token strategy.

    Returns:
        ChunkingStrategy instance.
    """
    if name == "question":
        return QuestionChunkingStrategy(settings=settings, llm_client=llm_client)
    return TokenChunkingStrategy(
        chunk_size=settings.chunk_size,
        overlap=settings.chunk_overlap,
    )