56 lines
2.1 KiB
Python
56 lines
2.1 KiB
Python
"""Phase 1 tests: Document chunking utilities.

This file drives test-first development of the chunking subsystem:

- Abstract base interface for chunking strategies
- Concrete TokenChunkingStrategy backed by tiktoken
- Edge cases: empty input, whitespace-only input, small input, and input
  longer than a single chunk
"""
|
|
|
|
import importlib.util
|
|
from pathlib import Path
|
|
import pytest
|
|
|
|
# Load the chunking module straight from its file path instead of through a
# package import, to avoid import path issues in the test environment.
CHUNKING_PATH = Path(__file__).resolve().parents[1] / "utils" / "chunking.py"
spec = importlib.util.spec_from_file_location("legco_chunking", str(CHUNKING_PATH))

# Validate the spec *before* it is used: spec_from_file_location() can return
# None, and a spec is not guaranteed to carry a loader. Checking up front gives
# a clear error message instead of an obscure failure further down.
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot load chunking module from {CHUNKING_PATH}")

chunking_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(chunking_module)

# Expose the classes under test as module-level names.
ChunkingStrategy = chunking_module.ChunkingStrategy
TokenChunkingStrategy = chunking_module.TokenChunkingStrategy
|
|
|
|
|
|
def test_abstract_base_class_not_instantiable():
    """ChunkingStrategy is abstract: instantiating it directly must raise TypeError."""
    with pytest.raises(TypeError):
        ChunkingStrategy()  # type: ignore
|
|
|
|
|
|
def test_empty_and_whitespace_inputs_yield_no_chunks():
    """Empty and whitespace-only text must both chunk to an empty list."""
    chunker = TokenChunkingStrategy()
    for blank_text in ("", " \n\t"):
        assert chunker.chunk(blank_text) == []
|
|
|
|
|
|
def test_text_shorter_than_chunk_size_results_in_single_chunk():
    """Text that fits within one chunk comes back as a single, unchanged chunk."""
    short_text = "Hello world"  # two tokens in typical tokenization
    # A small chunk size keeps the test deterministic.
    chunker = TokenChunkingStrategy(chunk_size=4, overlap=2)
    result = chunker.chunk(short_text)
    assert isinstance(result, list)
    # Exactly one chunk, equal to the input text.
    assert result == [short_text]
|
|
|
|
|
|
def test_text_longer_produces_multiple_chunks():
    """Text exceeding the chunk size is split into several non-empty string chunks."""
    # Repeat a simple word so the input should exceed chunk_size tokens.
    repeated_words = " ".join(["word"] * 1100)
    chunker = TokenChunkingStrategy(chunk_size=1000, overlap=200)
    pieces = chunker.chunk(repeated_words)
    assert isinstance(pieces, list)
    assert len(pieces) >= 2
    # Every chunk must be a string, and none may be empty.
    assert all(isinstance(piece, str) for piece in pieces)
    assert all(len(piece) > 0 for piece in pieces)
|