# legco_ai_assistant/backend/app/utils/chunking.py

"""Chunking utilities for Phase 1.2.

Provides an abstract ChunkingStrategy and a concrete
TokenChunkingStrategy that uses tiktoken to chunk text into
token-based windows.
"""
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import List


class ChunkingStrategy(ABC):
    """Interface that every text chunking strategy implements.

    Subclasses must override :meth:`chunk`; because that method is abstract,
    this base class cannot be instantiated on its own.
    """

    @abstractmethod
    def chunk(self, text: str) -> List[str]:
        """Break ``text`` into an ordered list of string chunks.

        Contract for implementations:

        * Empty or whitespace-only input should produce an empty list.
        * Chunks are returned in sequence order. Adjacent chunks may repeat
          some raw text when the implementation uses overlapping token
          windows.
        """
        raise NotImplementedError()


class TokenChunkingStrategy(ChunkingStrategy):
    """Chunk text into sliding token windows using a tiktoken encoder.

    Each chunk holds at most ``chunk_size`` tokens, and consecutive chunks
    share ``overlap`` tokens. The window advances by
    ``chunk_size - overlap`` tokens per chunk; if that difference is not
    positive, the window advances one token at a time so the loop always
    makes progress.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200, encoding_name: str = "cl100k_base"):
        """Validate the window parameters and load the tiktoken encoding.

        Args:
            chunk_size: Maximum number of tokens per chunk; must be positive.
            overlap: Number of tokens shared by consecutive chunks; must be
                non-negative.
            encoding_name: Name passed to ``tiktoken.get_encoding``.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap < 0:
            raise ValueError("overlap must be non-negative")
        self.chunk_size = chunk_size
        self.overlap = overlap
        # Lazy import to avoid import-time penalties in environments without tokenizers
        import tiktoken

        self._encoding = tiktoken.get_encoding(encoding_name)

    def chunk(self, text: str) -> List[str]:
        """Split ``text`` into decoded token windows.

        Args:
            text: The text to split.

        Returns:
            The decoded chunks in order. Empty or whitespace-only input, or
            input that encodes to no tokens, yields an empty list. The last
            chunk may hold fewer than ``chunk_size`` tokens.

        Raises:
            TypeError: If ``text`` is not a ``str``.
        """
        if not isinstance(text, str):
            raise TypeError("text must be a string")
        if not text.strip():
            return []

        tokens = self._encoding.encode(text)
        if not tokens:
            return []

        total = len(tokens)
        # Clamp to 1 so that overlap >= chunk_size still makes progress.
        step = max(self.chunk_size - self.overlap, 1)

        chunks: List[str] = []
        for start in range(0, total, step):
            end = start + self.chunk_size
            # NOTE(review): decoding an arbitrary token slice may cut through
            # a multi-byte character; tiktoken documents decode() as lossy in
            # that case -- confirm this is acceptable for non-ASCII input.
            chunks.append(self._encoding.decode(tokens[start:end]))
            # Stop once the window covers the final token. Continuing would
            # emit a trailing chunk made only of tokens that are already
            # contained in this one.
            if end >= total:
                break

        return chunks