legco_ai_assistant/backend/app/utils/chunking.py

120 lines
4.1 KiB
Python

"""Chunking utilities for Phase 1.2.
Provides an abstract ChunkingStrategy and a concrete
TokenChunkingStrategy that uses tiktoken to chunk text into
token-based windows.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import List, Tuple
class ChunkingStrategy(ABC):
    """Interface implemented by every text chunking strategy."""

    @abstractmethod
    def chunk(self, text: str) -> List[str]:
        """Split ``text`` into an ordered list of chunk strings.

        Implementations should return an empty list for empty or
        whitespace-only input. Chunks are produced in document order;
        consecutive chunks may share some raw text when the strategy uses
        overlapping windows.
        """
        raise NotImplementedError


class TokenChunkingStrategy(ChunkingStrategy):
    """Chunk text by token windows using a tiktoken encoder.

    Each chunk holds up to ``chunk_size`` tokens, and consecutive chunks
    share ``overlap`` tokens.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        overlap: int = 200,
        encoding_name: str = "cl100k_base",
    ):
        """Validate the window settings and load the encoder.

        Args:
            chunk_size: Maximum number of tokens per chunk; must be positive.
            overlap: Number of tokens shared by consecutive chunks; must be
                non-negative.
            encoding_name: Name passed to ``tiktoken.get_encoding``.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                negative.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap < 0:
            raise ValueError("overlap must be non-negative")
        self.chunk_size = chunk_size
        self.overlap = overlap
        # Imported lazily so that importing this module does not itself
        # require the tokenizer package to be installed.
        import tiktoken

        self._encoding = tiktoken.get_encoding(encoding_name)

    def chunk(self, text: str) -> List[str]:
        """Split ``text`` into overlapping token windows.

        Windows start every ``chunk_size - overlap`` tokens (at least one
        token apart, so an overlap >= chunk_size still makes progress).
        Iteration stops as soon as a window reaches the last token, so no
        trailing chunk consisting only of already-covered overlap is emitted.

        Args:
            text: The text to split.

        Returns:
            The decoded chunks in document order; an empty list for empty or
            whitespace-only input.

        Raises:
            TypeError: If ``text`` is not a string.
        """
        if not isinstance(text, str):
            raise TypeError("text must be a string")
        if not text.strip():
            return []

        tokens = self._encoding.encode(text)
        if not tokens:
            return []

        total = len(tokens)
        # Clamp to 1 to ensure progress even with extreme overlap.
        step = max(self.chunk_size - self.overlap, 1)

        chunks: List[str] = []
        for start in range(0, total, step):
            end = start + self.chunk_size
            # NOTE(review): a window edge can fall inside a character that
            # spans several tokens; how decode() renders that is up to the
            # encoder -- confirm this is acceptable for downstream use.
            chunks.append(self._encoding.decode(tokens[start:end]))
            # This window already covers the final token; any further window
            # would only repeat tokens that were emitted above.
            if end >= total:
                break
        return chunks

    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
    ) -> List[Tuple[str, int]]:
        """Chunk page-segmented text with overlap from adjacent pages.

        For each page, creates one chunk containing:
        [last overlap_tokens of previous page] + [full current page]
        + [first overlap_tokens of next page], joined with newlines.
        One chunk per page -- a page is never split, even if oversized.
        The returned page number always refers to the main page, not to the
        pages that contributed overlap. ``self.overlap`` and
        ``self.chunk_size`` are not consulted here.

        Args:
            pages: List of (page_number, page_text) tuples. 1-indexed.
            overlap_tokens: Number of tokens to include from each adjacent
                page. Values <= 0 disable overlap entirely.

        Returns:
            List of (chunk_text, page_number) tuples. One chunk per page.
        """
        if not pages:
            return []
        if overlap_tokens <= 0:
            # No overlap requested: each chunk is just its page text, so
            # there is no need to tokenize anything.
            return [(page_text, page_num) for page_num, page_text in pages]

        tokenized: List[List[int]] = [
            self._encoding.encode(page_text) for _, page_text in pages
        ]
        last_index = len(pages) - 1

        results: List[Tuple[str, int]] = []
        for i, (page_num, page_text) in enumerate(pages):
            parts: List[str] = []
            if i > 0:
                before = self._encoding.decode(tokenized[i - 1][-overlap_tokens:])
                # An empty neighbour contributes nothing; skipping it avoids
                # a stray leading newline in the joined chunk.
                if before:
                    parts.append(before)
            parts.append(page_text)
            if i < last_index:
                after = self._encoding.decode(tokenized[i + 1][:overlap_tokens])
                if after:
                    parts.append(after)
            results.append(("\n".join(parts), page_num))
        return results