legco_ai_assistant/backend/app/test/test_phase1_chunking.py

56 lines
2.1 KiB
Python

"""Phase 1 tests: Document chunking utilities.
This file drives Test-First development for the chunking subsystem:
- Abstract base interface for chunking strategies
- Concrete TokenChunkingStrategy backed by tiktoken
- Edge cases: empty input, whitespace-only input, small input
"""
import importlib.util
from pathlib import Path
import pytest
# Dynamically load the chunking module directly from the filesystem to avoid
# import path issues in the test environment.
CHUNKING_PATH = Path(__file__).resolve().parents[1] / "utils" / "chunking.py"
spec = importlib.util.spec_from_file_location("legco_chunking", str(CHUNKING_PATH))
# Validate the spec BEFORE it is used: spec_from_file_location() may return
# None, and passing None to module_from_spec() would fail with an opaque
# AttributeError instead of this explicit message.
assert spec and spec.loader, f"Cannot build an import spec for {CHUNKING_PATH}"
chunking_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(chunking_module)
# Re-export the classes under test as module-level names for the tests below.
ChunkingStrategy = chunking_module.ChunkingStrategy
TokenChunkingStrategy = chunking_module.TokenChunkingStrategy
def test_abstract_base_class_not_instantiable():
    """Check that instantiating ChunkingStrategy directly raises TypeError."""
    # Abstract base class should not be instantiable directly
    with pytest.raises(TypeError):
        ChunkingStrategy()  # type: ignore
def test_empty_and_whitespace_inputs_yield_no_chunks():
    """Check that empty and whitespace-only text both produce an empty list."""
    strat = TokenChunkingStrategy()
    assert strat.chunk("") == []
    assert strat.chunk(" \n\t") == []
def test_text_shorter_than_chunk_size_results_in_single_chunk():
    """Check that text below the chunk size comes back as one unchanged chunk."""
    # Use a small chunk size for a deterministic test
    strat = TokenChunkingStrategy(chunk_size=4, overlap=2)
    text = "Hello world"  # two tokens in typical tokenization
    chunks = strat.chunk(text)
    assert isinstance(chunks, list)
    assert len(chunks) == 1
    # The single chunk must reproduce the input exactly.
    assert chunks[0] == text
def test_text_longer_produces_multiple_chunks():
    """Check that text above the chunk size is split into several non-empty strings."""
    # Build a long sequence by repeating a simple token to ensure > chunk_size tokens
    long_text = ("word " * 1100).strip()
    strat = TokenChunkingStrategy(chunk_size=1000, overlap=200)
    chunks = strat.chunk(long_text)
    assert isinstance(chunks, list)
    assert len(chunks) >= 2
    # Ensure every chunk is a non-empty string. Chunk contents are not
    # compared against the input here; only type and non-emptiness are checked.
    assert all(isinstance(c, str) for c in chunks)
    assert all(len(c) > 0 for c in chunks)