"""Chunking utilities for Phase 1.2. Provides an abstract ChunkingStrategy and a concrete TokenChunkingStrategy that uses tiktoken to chunk text into token-based windows. """ from __future__ import annotations from abc import ABC, abstractmethod from typing import List class ChunkingStrategy(ABC): """Abstract base class for text chunking strategies.""" @abstractmethod def chunk(self, text: str) -> List[str]: """Split text into a list of chunks (strings). Implementations should return an empty list for empty or whitespace-only input. The output chunks should be non-overlapping in terms of the produced sequence when considering the token boundaries, but may overlap in raw text due to token-based windowing. """ raise NotImplementedError class TokenChunkingStrategy(ChunkingStrategy): """Chunk text by token windows using the tiktoken encoder. The strategy operates on token counts: each chunk contains up to chunk_size tokens with overlap of overlap tokens between consecutive chunks. """ def __init__(self, chunk_size: int = 1000, overlap: int = 200, encoding_name: str = "cl100k_base"): if chunk_size <= 0: raise ValueError("chunk_size must be positive") if overlap < 0: raise ValueError("overlap must be non-negative") self.chunk_size = chunk_size self.overlap = overlap # Lazy import to avoid import-time penalties in environments without tokenizers import tiktoken self._encoding = tiktoken.get_encoding(encoding_name) def chunk(self, text: str) -> List[str]: if not isinstance(text, str): raise TypeError("text must be a string") if text.strip() == "": return [] # Tokenize the input text tokens = self._encoding.encode(text) if not tokens: return [] chunks: List[str] = [] step = self.chunk_size - self.overlap if step <= 0: step = 1 # ensure progress even with extreme overlap for i in range(0, len(tokens), step): segment = tokens[i : i + self.chunk_size] if not segment: break chunk_text = self._encoding.decode(segment) chunks.append(chunk_text) # If we reached the end of the token array, break early if len(segment) < self.chunk_size: break return chunks