"""Phase 1 tests: Document chunking utilities. This file drives Test-First development for the chunking subsystem: - Abstract base interface for chunking strategies - Concrete TokenChunkingStrategy backed by tiktoken - Edge cases: empty input, whitespace-only input, small input """ import importlib.util from pathlib import Path import pytest # Dynamically load the chunking module directly from the filesystem to avoid # import path issues in the test environment. CHUNKING_PATH = Path(__file__).resolve().parents[1] / "utils" / "chunking.py" spec = importlib.util.spec_from_file_location("legco_chunking", str(CHUNKING_PATH)) chunking_module = importlib.util.module_from_spec(spec) # type: ignore assert spec and spec.loader spec.loader.exec_module(chunking_module) # type: ignore ChunkingStrategy = chunking_module.ChunkingStrategy TokenChunkingStrategy = chunking_module.TokenChunkingStrategy def test_abstract_base_class_not_instantiable(): # Abstract base class should not be instantiable directly with pytest.raises(TypeError): ChunkingStrategy() # type: ignore def test_empty_and_whitespace_inputs_yield_no_chunks(): strat = TokenChunkingStrategy() assert strat.chunk("") == [] assert strat.chunk(" \n\t") == [] def test_text_shorter_than_chunk_size_results_in_single_chunk(): # Use a small chunk size for a deterministic test strat = TokenChunkingStrategy(chunk_size=4, overlap=2) text = "Hello world" # two tokens in typical tokenization chunks = strat.chunk(text) assert isinstance(chunks, list) assert len(chunks) == 1 assert chunks[0] == text def test_text_longer_produces_multiple_chunks(): # Build a long sequence by repeating a simple token to ensure > chunk_size tokens long_text = ("word " * 1100).strip() strat = TokenChunkingStrategy(chunk_size=1000, overlap=200) chunks = strat.chunk(long_text) assert isinstance(chunks, list) assert len(chunks) >= 2 # Ensure chunks are non-empty and that the transformation round-trips for the first chunk assert all(isinstance(c, str) for c in chunks) assert all(len(c) > 0 for c in chunks)