feat(backend): add page-aware chunking with adjacent-page overlap

Add chunk_pages() to TokenChunkingStrategy: one chunk per page with 200-token overlap from adjacent pages. Uses original page text for main content, decoded tokens for overlap. Never splits a page regardless of size. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-24 10:30:18 +08:00 · 2026-04-24 10:30:18 +08:00 · 0995c685fa
parent f4fa577fb0
commit 0995c685fa
2 changed files with 248 additions and 1 deletions
--- a/backend/app/test/test_phase1_page_aware_chunking.py
+++ b/backend/app/test/test_phase1_page_aware_chunking.py
@ -0,0 +1,201 @@
 """Phase 1.5.4: Page-aware chunking tests.
 Tests for TokenChunkingStrategy.chunk_pages() which creates one chunk per page
 with overlap context from adjacent pages.
 """
 import importlib.util
 from pathlib import Path
 import pytest
 # Dynamically load the chunking module directly from the filesystem to avoid
 # import path issues in the test environment.
 CHUNKING_PATH = Path(__file__).resolve().parents[1] / "utils" / "chunking.py"
 spec = importlib.util.spec_from_file_location("legco_chunking", str(CHUNKING_PATH))
 chunking_module = importlib.util.module_from_spec(spec)  # type: ignore
 assert spec and spec.loader
 spec.loader.exec_module(chunking_module)  # type: ignore
 TokenChunkingStrategy = chunking_module.TokenChunkingStrategy
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def _make_strategy() -> TokenChunkingStrategy:
    return TokenChunkingStrategy(chunk_size=1000, overlap=200)
 def _long_text(topic: str, min_tokens: int = 300) -> str:
    """Generate text with a unique topic marker and enough tokens to exceed min_tokens."""
    # Each word is roughly 1 token; add plenty of margin.
    return f"[{topic}] " + " ".join(f"{topic}-word{i}" for i in range(min_tokens))
 # ---------------------------------------------------------------------------
 # Tests
 # ---------------------------------------------------------------------------
 def test_chunk_pages_basic():
    """3 pages → 3 chunks, one per page, each contains main page text."""
    strat = _make_strategy()
    pages = [
        (1, _long_text("alpha")),
        (2, _long_text("beta")),
        (3, _long_text("gamma")),
    ]
    result = strat.chunk_pages(pages)
    assert len(result) == 3
    # Each result is (chunk_text, page_number)
    for i, (chunk_text, page_num) in enumerate(result):
        assert isinstance(chunk_text, str)
        assert page_num == pages[i][0]
        # Main page content must be present in the chunk
        assert pages[i][1] in chunk_text
 def test_chunk_pages_single_page():
    """Single page returns single chunk with no overlap."""
    strat = _make_strategy()
    text = _long_text("solo")
    pages = [(1, text)]
    result = strat.chunk_pages(pages)
    assert len(result) == 1
    chunk_text, page_num = result[0]
    assert page_num == 1
    assert text in chunk_text
    # No overlap content — chunk should be the original text (no extra newlines from joining)
    assert chunk_text.strip() == text.strip()
 def test_chunk_pages_first_page():
    """First page gets overlap_after from page 2 but no overlap_before."""
    strat = _make_strategy()
    pages = [
        (1, _long_text("first")),
        (2, _long_text("second")),
        (3, _long_text("third")),
    ]
    result = strat.chunk_pages(pages)
    chunk_text, page_num = result[0]
    assert page_num == 1
    # Main content present
    assert pages[0][1] in chunk_text
    # Overlap from page 2 present
    assert "second" in chunk_text
 def test_chunk_pages_last_page():
    """Last page gets overlap_before from page N-1 but no overlap_after."""
    strat = _make_strategy()
    pages = [
        (1, _long_text("first")),
        (2, _long_text("second")),
        (3, _long_text("third")),
    ]
    result = strat.chunk_pages(pages)
    chunk_text, page_num = result[-1]
    assert page_num == 3
    # Main content present
    assert pages[2][1] in chunk_text
    # Overlap from page 2 present
    assert "second" in chunk_text
 def test_chunk_pages_empty_input():
    """Empty list returns empty list."""
    strat = _make_strategy()
    result = strat.chunk_pages([])
    assert result == []
 def test_chunk_pages_overlap_content():
    """Verify overlap content comes from the correct adjacent pages.
    Use distinct, recognizable text per page so we can assert that page N's
    chunk includes tokens from pages N-1 and N+1.
    """
    strat = _make_strategy()
    pages = [
        (1, _long_text("page_one")),
        (2, _long_text("page_two")),
        (3, _long_text("page_three")),
    ]
    result = strat.chunk_pages(pages)
    # Page 2 chunk should contain overlap from both neighbors
    middle_chunk, middle_page = result[1]
    assert middle_page == 2
    assert "page_one" in middle_chunk
    assert "page_two" in middle_chunk
    assert "page_three" in middle_chunk
    # Page 1 chunk: no page_one overlap before (it IS page 1), but has page_two overlap after
    first_chunk, _ = result[0]
    assert "page_two" in first_chunk
    # Should NOT contain page_three (that's two pages away)
    assert "page_three" not in first_chunk
    # Page 3 chunk: has page_two overlap before, but no page_four after
    last_chunk, _ = result[2]
    assert "page_two" in last_chunk
    # Should NOT contain page_one (that's two pages away)
    assert "page_one" not in last_chunk
 def test_chunk_pages_returns_page_numbers():
    """Verify page numbers are correctly preserved in output."""
    strat = _make_strategy()
    pages = [
        (5, _long_text("five")),
        (10, _long_text("ten")),
        (99, _long_text("ninety_nine")),
    ]
    result = strat.chunk_pages(pages)
    assert len(result) == 3
    output_pages = [pn for _, pn in result]
    assert output_pages == [5, 10, 99]
 def test_chunk_pages_custom_overlap():
    """Test with non-default overlap_tokens value."""
    strat = _make_strategy()
    # Use very small overlap to verify it's respected
    pages = [
        (1, _long_text("aaa")),
        (2, _long_text("bbb")),
    ]
    result = strat.chunk_pages(pages, overlap_tokens=5)
    assert len(result) == 2
    # Both pages present
    assert result[0][1] == 1
    assert result[1][1] == 2
    # Page 1 should still have some overlap from page 2
    assert "bbb" in result[0][0]
    # Page 2 should still have some overlap from page 1
    assert "aaa" in result[1][0]
    # Verify with zero overlap
    result_zero = strat.chunk_pages(pages, overlap_tokens=0)
    # Page 1 chunk should NOT contain page 2 content
    assert "bbb" not in result_zero[0][0]
    # Page 2 chunk should NOT contain page 1 content
    assert "aaa" not in result_zero[1][0]
 def test_chunk_pages_output_format():
    """Each result element is a (str, int) tuple."""
    strat = _make_strategy()
    pages = [(1, "Short text one."), (2, "Short text two.")]
    result = strat.chunk_pages(pages)
    for chunk_text, page_num in result:
        assert isinstance(chunk_text, str)
        assert isinstance(page_num, int)
--- a/backend/app/utils/chunking.py
+++ b/backend/app/utils/chunking.py
@ -7,7 +7,7 @@ token-based windows.
 from __future__ import annotations
 from abc import ABC, abstractmethod
-from typing import List
+from typing import List, Tuple
 class ChunkingStrategy(ABC):
@ -71,3 +71,49 @@ class TokenChunkingStrategy(ChunkingStrategy):
                break
        return chunks
    def chunk_pages(
        self, pages: List[Tuple[int, str]], overlap_tokens: int = 200
    ) -> List[Tuple[str, int]]:
        """Chunk page-segmented text with overlap from adjacent pages.
        For each page, creates one chunk containing:
          [last overlap_tokens of previous page] + [full current page] + [first overlap_tokens of next page]
        One chunk per page — never splits a page even if oversized.
        The page_number metadata always refers to the main page (N), not overlap pages.
        Args:
            pages: List of (page_number, page_text) tuples. 1-indexed.
            overlap_tokens: Number of tokens to include from adjacent pages.
        Returns:
            List of (chunk_text, page_number) tuples. One chunk per page.
        """
        if not pages:
            return []
        tokenized: List[List[int]] = [
            self._encoding.encode(text) for _, text in pages
        ]
        results: List[Tuple[str, int]] = []
        for i, (page_num, page_text) in enumerate(pages):
            parts: List[str] = []
            if i > 0 and overlap_tokens > 0:
                prev_tokens = tokenized[i - 1]
                overlap_before = prev_tokens[-overlap_tokens:]
                parts.append(self._encoding.decode(overlap_before))
            parts.append(page_text)
            if i < len(pages) - 1 and overlap_tokens > 0:
                next_tokens = tokenized[i + 1]
                overlap_after = next_tokens[:overlap_tokens]
                parts.append(self._encoding.decode(overlap_after))
            results.append(("\n".join(parts), page_num))
        return results