feat(backend): add page-aware chunking with adjacent-page overlap

Add chunk_pages() to TokenChunkingStrategy: one chunk per page with 200-token overlap from adjacent pages. Uses original page text for main content, decoded tokens for overlap. Never splits a page regardless of size.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-24 10:30:18 +08:00
parent f4fa577fb0
commit 0995c685fa
2 changed files with 248 additions and 1 deletions

View File

@ -0,0 +1,201 @@
"""Phase 1.5.4: Page-aware chunking tests.
Tests for TokenChunkingStrategy.chunk_pages() which creates one chunk per page
with overlap context from adjacent pages.
"""
import importlib.util
from pathlib import Path
import pytest
# Dynamically load the chunking module directly from the filesystem to avoid
# import path issues in the test environment.
CHUNKING_PATH = Path(__file__).resolve().parents[1] / "utils" / "chunking.py"
spec = importlib.util.spec_from_file_location("legco_chunking", str(CHUNKING_PATH))
chunking_module = importlib.util.module_from_spec(spec) # type: ignore
assert spec and spec.loader
spec.loader.exec_module(chunking_module) # type: ignore
TokenChunkingStrategy = chunking_module.TokenChunkingStrategy
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_strategy() -> TokenChunkingStrategy:
return TokenChunkingStrategy(chunk_size=1000, overlap=200)
def _long_text(topic: str, min_tokens: int = 300) -> str:
"""Generate text with a unique topic marker and enough tokens to exceed min_tokens."""
# Each word is roughly 1 token; add plenty of margin.
return f"[{topic}] " + " ".join(f"{topic}-word{i}" for i in range(min_tokens))
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
def test_chunk_pages_basic():
"""3 pages → 3 chunks, one per page, each contains main page text."""
strat = _make_strategy()
pages = [
(1, _long_text("alpha")),
(2, _long_text("beta")),
(3, _long_text("gamma")),
]
result = strat.chunk_pages(pages)
assert len(result) == 3
# Each result is (chunk_text, page_number)
for i, (chunk_text, page_num) in enumerate(result):
assert isinstance(chunk_text, str)
assert page_num == pages[i][0]
# Main page content must be present in the chunk
assert pages[i][1] in chunk_text
def test_chunk_pages_single_page():
"""Single page returns single chunk with no overlap."""
strat = _make_strategy()
text = _long_text("solo")
pages = [(1, text)]
result = strat.chunk_pages(pages)
assert len(result) == 1
chunk_text, page_num = result[0]
assert page_num == 1
assert text in chunk_text
# No overlap content — chunk should be the original text (no extra newlines from joining)
assert chunk_text.strip() == text.strip()
def test_chunk_pages_first_page():
"""First page gets overlap_after from page 2 but no overlap_before."""
strat = _make_strategy()
pages = [
(1, _long_text("first")),
(2, _long_text("second")),
(3, _long_text("third")),
]
result = strat.chunk_pages(pages)
chunk_text, page_num = result[0]
assert page_num == 1
# Main content present
assert pages[0][1] in chunk_text
# Overlap from page 2 present
assert "second" in chunk_text
def test_chunk_pages_last_page():
"""Last page gets overlap_before from page N-1 but no overlap_after."""
strat = _make_strategy()
pages = [
(1, _long_text("first")),
(2, _long_text("second")),
(3, _long_text("third")),
]
result = strat.chunk_pages(pages)
chunk_text, page_num = result[-1]
assert page_num == 3
# Main content present
assert pages[2][1] in chunk_text
# Overlap from page 2 present
assert "second" in chunk_text
def test_chunk_pages_empty_input():
"""Empty list returns empty list."""
strat = _make_strategy()
result = strat.chunk_pages([])
assert result == []
def test_chunk_pages_overlap_content():
"""Verify overlap content comes from the correct adjacent pages.
Use distinct, recognizable text per page so we can assert that page N's
chunk includes tokens from pages N-1 and N+1.
"""
strat = _make_strategy()
pages = [
(1, _long_text("page_one")),
(2, _long_text("page_two")),
(3, _long_text("page_three")),
]
result = strat.chunk_pages(pages)
# Page 2 chunk should contain overlap from both neighbors
middle_chunk, middle_page = result[1]
assert middle_page == 2
assert "page_one" in middle_chunk
assert "page_two" in middle_chunk
assert "page_three" in middle_chunk
# Page 1 chunk: no page_one overlap before (it IS page 1), but has page_two overlap after
first_chunk, _ = result[0]
assert "page_two" in first_chunk
# Should NOT contain page_three (that's two pages away)
assert "page_three" not in first_chunk
# Page 3 chunk: has page_two overlap before, but no page_four after
last_chunk, _ = result[2]
assert "page_two" in last_chunk
# Should NOT contain page_one (that's two pages away)
assert "page_one" not in last_chunk
def test_chunk_pages_returns_page_numbers():
"""Verify page numbers are correctly preserved in output."""
strat = _make_strategy()
pages = [
(5, _long_text("five")),
(10, _long_text("ten")),
(99, _long_text("ninety_nine")),
]
result = strat.chunk_pages(pages)
assert len(result) == 3
output_pages = [pn for _, pn in result]
assert output_pages == [5, 10, 99]
def test_chunk_pages_custom_overlap():
"""Test with non-default overlap_tokens value."""
strat = _make_strategy()
# Use very small overlap to verify it's respected
pages = [
(1, _long_text("aaa")),
(2, _long_text("bbb")),
]
result = strat.chunk_pages(pages, overlap_tokens=5)
assert len(result) == 2
# Both pages present
assert result[0][1] == 1
assert result[1][1] == 2
# Page 1 should still have some overlap from page 2
assert "bbb" in result[0][0]
# Page 2 should still have some overlap from page 1
assert "aaa" in result[1][0]
# Verify with zero overlap
result_zero = strat.chunk_pages(pages, overlap_tokens=0)
# Page 1 chunk should NOT contain page 2 content
assert "bbb" not in result_zero[0][0]
# Page 2 chunk should NOT contain page 1 content
assert "aaa" not in result_zero[1][0]
def test_chunk_pages_output_format():
"""Each result element is a (str, int) tuple."""
strat = _make_strategy()
pages = [(1, "Short text one."), (2, "Short text two.")]
result = strat.chunk_pages(pages)
for chunk_text, page_num in result:
assert isinstance(chunk_text, str)
assert isinstance(page_num, int)

View File

@ -7,7 +7,7 @@ token-based windows.
from __future__ import annotations from __future__ import annotations
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import List from typing import List, Tuple
class ChunkingStrategy(ABC): class ChunkingStrategy(ABC):
@ -71,3 +71,49 @@ class TokenChunkingStrategy(ChunkingStrategy):
break break
return chunks return chunks
def chunk_pages(
self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
) -> List[Tuple[str, int]]:
"""Chunk page-segmented text with overlap from adjacent pages.
For each page, creates one chunk containing:
[last overlap_tokens of previous page] + [full current page] + [first overlap_tokens of next page]
One chunk per page never splits a page even if oversized.
The page_number metadata always refers to the main page (N), not overlap pages.
Args:
pages: List of (page_number, page_text) tuples. 1-indexed.
overlap_tokens: Number of tokens to include from adjacent pages.
Returns:
List of (chunk_text, page_number) tuples. One chunk per page.
"""
if not pages:
return []
tokenized: List[List[int]] = [
self._encoding.encode(text) for _, text in pages
]
results: List[Tuple[str, int]] = []
for i, (page_num, page_text) in enumerate(pages):
parts: List[str] = []
if i > 0 and overlap_tokens > 0:
prev_tokens = tokenized[i - 1]
overlap_before = prev_tokens[-overlap_tokens:]
parts.append(self._encoding.decode(overlap_before))
parts.append(page_text)
if i < len(pages) - 1 and overlap_tokens > 0:
next_tokens = tokenized[i + 1]
overlap_after = next_tokens[:overlap_tokens]
parts.append(self._encoding.decode(overlap_after))
results.append(("\n".join(parts), page_num))
return results