feat(backend): add page-aware chunking with adjacent-page overlap
Add chunk_pages() to TokenChunkingStrategy: one chunk per page with 200-token overlap from adjacent pages. Uses original page text for main content, decoded tokens for overlap. Never splits a page regardless of size. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
f4fa577fb0
commit
0995c685fa
|
|
@ -0,0 +1,201 @@
|
|||
"""Phase 1.5.4: Page-aware chunking tests.
|
||||
|
||||
Tests for TokenChunkingStrategy.chunk_pages() which creates one chunk per page
|
||||
with overlap context from adjacent pages.
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
# Load the chunking module straight from its file path so the tests do not
# depend on the package being importable in the test environment.
CHUNKING_PATH = Path(__file__).resolve().parents[1] / "utils" / "chunking.py"
spec = importlib.util.spec_from_file_location("legco_chunking", str(CHUNKING_PATH))
# Check the spec before it is used: the guard must run ahead of
# module_from_spec(), otherwise a missing spec surfaces as an unrelated
# attribute error instead of this assertion.
assert spec and spec.loader
chunking_module = importlib.util.module_from_spec(spec)  # type: ignore
spec.loader.exec_module(chunking_module)  # type: ignore
# Re-export the class under test so the test functions can refer to it directly.
TokenChunkingStrategy = chunking_module.TokenChunkingStrategy
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_strategy() -> TokenChunkingStrategy:
    """Return a fresh TokenChunkingStrategy built with chunk_size=1000 and overlap=200."""
    strategy = TokenChunkingStrategy(chunk_size=1000, overlap=200)
    return strategy
|
||||
|
||||
|
||||
def _long_text(topic: str, min_tokens: int = 300) -> str:
|
||||
"""Generate text with a unique topic marker and enough tokens to exceed min_tokens."""
|
||||
# Each word is roughly 1 token; add plenty of margin.
|
||||
return f"[{topic}] " + " ".join(f"{topic}-word{i}" for i in range(min_tokens))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_chunk_pages_basic():
    """Three input pages give three chunks, each holding its own page's full text."""
    topics = ("alpha", "beta", "gamma")
    pages = [
        (number, _long_text(topic)) for number, topic in enumerate(topics, start=1)
    ]

    result = _make_strategy().chunk_pages(pages)

    assert len(result) == 3
    # Results are (chunk_text, page_number) pairs in the same order as the input.
    for (expected_num, expected_text), (chunk_text, page_num) in zip(pages, result):
        assert isinstance(chunk_text, str)
        assert page_num == expected_num
        # The page's own text must appear verbatim inside its chunk.
        assert expected_text in chunk_text
|
||||
|
||||
|
||||
def test_chunk_pages_single_page():
    """A lone page yields exactly one chunk that carries no overlap text."""
    page_text = _long_text("solo")

    result = _make_strategy().chunk_pages([(1, page_text)])

    assert len(result) == 1
    chunk_text, page_num = result[0]
    assert page_num == 1
    assert page_text in chunk_text
    # With no neighbours there is nothing to join, so apart from surrounding
    # whitespace the chunk must equal the page text.
    assert chunk_text.strip() == page_text.strip()
|
||||
|
||||
|
||||
def test_chunk_pages_first_page():
    """First page gets overlap_after from page 2 but no overlap_before."""
    strat = _make_strategy()
    pages = [
        (1, _long_text("first")),
        (2, _long_text("second")),
        (3, _long_text("third")),
    ]
    result = strat.chunk_pages(pages)

    chunk_text, page_num = result[0]
    assert page_num == 1
    # Main content present
    assert pages[0][1] in chunk_text
    # No overlap_before: nothing may precede the first page's own text.
    assert chunk_text.startswith(pages[0][1])
    # Overlap from page 2 present
    assert "second" in chunk_text
|
||||
|
||||
|
||||
def test_chunk_pages_last_page():
    """Last page gets overlap_before from page N-1 but no overlap_after."""
    strat = _make_strategy()
    pages = [
        (1, _long_text("first")),
        (2, _long_text("second")),
        (3, _long_text("third")),
    ]
    result = strat.chunk_pages(pages)

    chunk_text, page_num = result[-1]
    assert page_num == 3
    # Main content present
    assert pages[2][1] in chunk_text
    # No overlap_after: nothing may follow the last page's own text.
    assert chunk_text.endswith(pages[2][1])
    # Overlap from page 2 present
    assert "second" in chunk_text
|
||||
|
||||
|
||||
def test_chunk_pages_empty_input():
    """Chunking an empty page list produces an empty result list."""
    strategy = _make_strategy()
    assert strategy.chunk_pages([]) == []
|
||||
|
||||
|
||||
def test_chunk_pages_overlap_content():
    """Overlap text is drawn only from the directly adjacent pages.

    Every page carries its own recognizable marker, so the presence or absence
    of a marker in a chunk shows which pages contributed to it.
    """
    markers = ("page_one", "page_two", "page_three")
    pages = [
        (number, _long_text(marker)) for number, marker in enumerate(markers, start=1)
    ]

    result = _make_strategy().chunk_pages(pages)

    # Middle page: its own text plus overlap from both neighbours.
    middle_chunk, middle_page = result[1]
    assert middle_page == 2
    for marker in markers:
        assert marker in middle_chunk

    # First page: overlap from page 2 only; page 3 is two pages away.
    first_chunk, _ = result[0]
    assert "page_two" in first_chunk
    assert "page_three" not in first_chunk

    # Last page: overlap from page 2 only; page 1 is two pages away.
    last_chunk, _ = result[2]
    assert "page_two" in last_chunk
    assert "page_one" not in last_chunk
|
||||
|
||||
|
||||
def test_chunk_pages_returns_page_numbers():
    """Non-contiguous page numbers pass through to the output unchanged."""
    numbered_topics = [(5, "five"), (10, "ten"), (99, "ninety_nine")]
    pages = [(number, _long_text(topic)) for number, topic in numbered_topics]

    result = _make_strategy().chunk_pages(pages)

    assert len(result) == 3
    returned_numbers = [number for _text, number in result]
    assert returned_numbers == [5, 10, 99]
|
||||
|
||||
|
||||
def test_chunk_pages_custom_overlap():
    """A caller-supplied overlap_tokens value is honoured, including zero."""
    strategy = _make_strategy()
    pages = [(1, _long_text("aaa")), (2, _long_text("bbb"))]

    # A very small overlap must still pull some text in from the neighbour.
    small_overlap = strategy.chunk_pages(pages, overlap_tokens=5)
    assert len(small_overlap) == 2
    first, second = small_overlap
    assert first[1] == 1
    assert second[1] == 2
    assert "bbb" in first[0]
    assert "aaa" in second[0]

    # Zero overlap: each chunk must be free of the other page's text.
    no_overlap = strategy.chunk_pages(pages, overlap_tokens=0)
    assert "bbb" not in no_overlap[0][0]
    assert "aaa" not in no_overlap[1][0]
|
||||
|
||||
|
||||
def test_chunk_pages_output_format():
    """Every element of the result unpacks into a (str, int) pair."""
    short_pages = [(1, "Short text one."), (2, "Short text two.")]

    result = _make_strategy().chunk_pages(short_pages)

    for text, number in result:
        assert isinstance(text, str)
        assert isinstance(number, int)
|
||||
|
|
@ -7,7 +7,7 @@ token-based windows.
|
|||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
from typing import List, Tuple
|
||||
|
||||
|
||||
class ChunkingStrategy(ABC):
|
||||
|
|
@ -71,3 +71,49 @@ class TokenChunkingStrategy(ChunkingStrategy):
|
|||
break
|
||||
|
||||
return chunks
|
||||
|
||||
def chunk_pages(
    self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
) -> List[Tuple[str, int]]:
    """Build one chunk per page, padded with context from the adjacent pages.

    Each chunk is assembled, in this order, from:
        [last overlap_tokens tokens of the previous page, decoded to text]
        [the current page's original text, unmodified]
        [first overlap_tokens tokens of the next page, decoded to text]
    The pieces are joined with a single newline. A page is never split,
    however large it is. An overlap piece that decodes to an empty string
    (for example, a blank neighbouring page) is left out so that it does not
    add a stray leading or trailing newline to the chunk.

    Args:
        pages: (page_number, page_text) tuples in document order. Neighbours
            are determined by position in this list, not by page number.
        overlap_tokens: Maximum number of tokens borrowed from each adjacent
            page. A neighbour with fewer tokens contributes all of them.
            Values <= 0 disable overlap entirely.

    Returns:
        A list of (chunk_text, page_number) tuples, one per input page and in
        input order. page_number is always that of the main page, never of a
        page that only contributed overlap. An empty input gives an empty list.
    """
    if not pages:
        return []

    # Tokenize every page once up front, so each page's tokens can serve as
    # overlap for both of its neighbours without being encoded again.
    # NOTE(review): self._encoding is presumably a tiktoken-style encoding
    # exposing encode()/decode() -- confirm against the class constructor.
    tokenized: List[List[int]] = [
        self._encoding.encode(text) for _, text in pages
    ]
    with_overlap = overlap_tokens > 0
    last_index = len(pages) - 1

    results: List[Tuple[str, int]] = []
    for i, (page_num, page_text) in enumerate(pages):
        parts: List[str] = []

        if with_overlap and i > 0:
            # Tail of the previous page.
            # NOTE(review): slicing at a token boundary may cut through a
            # multi-byte character, depending on the tokenizer -- verify for
            # non-ASCII documents.
            overlap_before = self._encoding.decode(
                tokenized[i - 1][-overlap_tokens:]
            )
            if overlap_before:
                parts.append(overlap_before)

        # The main page always goes in as its original text, not as
        # re-decoded tokens, so it is preserved exactly.
        parts.append(page_text)

        if with_overlap and i < last_index:
            # Head of the next page.
            overlap_after = self._encoding.decode(
                tokenized[i + 1][:overlap_tokens]
            )
            if overlap_after:
                parts.append(overlap_after)

        results.append(("\n".join(parts), page_num))

    return results
|
||||
|
|
|
|||
Loading…
Reference in New Issue