74 lines
2.4 KiB
Python
74 lines
2.4 KiB
Python
"""Chunking utilities for Phase 1.2.

Provides an abstract ChunkingStrategy and a concrete
TokenChunkingStrategy that uses tiktoken to chunk text into
token-based windows.
"""
|
|
from __future__ import annotations
|
|
|
|
from abc import ABC, abstractmethod
|
|
from typing import List
|
|
|
|
|
|
class ChunkingStrategy(ABC):
    """Interface that every text chunking strategy must implement.

    Subclasses provide ``chunk``; the base class cannot be instantiated
    because ``chunk`` is declared abstract.
    """

    @abstractmethod
    def chunk(self, text: str) -> List[str]:
        """Break ``text`` into an ordered list of string chunks.

        Contract for implementations:

        * Empty or whitespace-only input should produce an empty list.
        * Each chunk is a distinct entry of the returned sequence with
          respect to token boundaries; neighbouring chunks may nevertheless
          share raw text when token-based windowing is used.

        Args:
            text: The text to split.

        Returns:
            The chunks, as strings.

        Raises:
            NotImplementedError: Always, when the base implementation is
                reached (for example through ``super().chunk(text)``).
        """
        raise NotImplementedError()
|
|
|
|
|
|
class TokenChunkingStrategy(ChunkingStrategy):
    """Chunk text by token windows using the tiktoken encoder.

    The text is encoded to tokens, cut into windows of at most
    ``chunk_size`` tokens, and each window is decoded back to a string.
    Consecutive windows start ``chunk_size - overlap`` tokens apart, so
    they share ``overlap`` tokens.

    Attributes are set in ``__init__``:
        chunk_size: maximum number of tokens per chunk.
        overlap: number of tokens shared between consecutive chunks.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200, encoding_name: str = "cl100k_base"):
        """Validate the window parameters and load the tiktoken encoding.

        Args:
            chunk_size: Maximum number of tokens per chunk; must be > 0.
            overlap: Number of tokens shared by consecutive chunks; must
                be >= 0. A value >= ``chunk_size`` is accepted; ``chunk``
                then advances one token at a time.
            encoding_name: Name passed to ``tiktoken.get_encoding``.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap < 0:
            raise ValueError("overlap must be non-negative")
        self.chunk_size = chunk_size
        self.overlap = overlap
        # Lazy import to avoid import-time penalties in environments
        # without tokenizers: tiktoken is only needed once an instance is
        # actually constructed.
        import tiktoken

        self._encoding = tiktoken.get_encoding(encoding_name)

    def chunk(self, text: str) -> List[str]:
        """Split ``text`` into decoded token windows.

        Args:
            text: The text to chunk.

        Returns:
            The decoded windows in order. Empty for empty or
            whitespace-only input, or when encoding yields no tokens. The
            final window ends exactly at the last token; no extra chunk
            made only of already-emitted overlap tokens is produced.

        Raises:
            TypeError: If ``text`` is not a ``str``.
        """
        if not isinstance(text, str):
            raise TypeError("text must be a string")
        if not text.strip():
            return []

        tokens = self._encoding.encode(text)
        if not tokens:
            return []

        # Distance between window starts. With overlap >= chunk_size the
        # difference is not positive, so fall back to 1 to guarantee the
        # loop still makes progress.
        step = self.chunk_size - self.overlap
        if step <= 0:
            step = 1

        total = len(tokens)
        chunks: List[str] = []
        for start in range(0, total, step):
            end = start + self.chunk_size
            # NOTE(review): windows are cut on token boundaries, which
            # presumably need not align with character boundaries; confirm
            # how decode() renders a window that begins or ends
            # mid-character.
            chunks.append(self._encoding.decode(tokens[start:end]))
            # Stop as soon as a window reaches the end of the tokens.
            # Checking ``end >= total`` (rather than "window is short")
            # also covers a window that ends exactly on the last token;
            # continuing would only re-emit tokens that the window just
            # appended already contains.
            if end >= total:
                break

        return chunks
|