feat(backend): add page-aware PDF parsing with per-page text extraction

Add parse_pdf_by_page(), which returns a List[Tuple[int, str]] of (page_number, text) pairs with 1-indexed page numbers. Pages with no extractable text are skipped. It follows the same error handling as the existing parse_pdf(). Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-24 10:30:04 +08:00 · 2026-04-24 10:30:04 +08:00 · f4fa577fb0
parent 5dcb71369c
commit f4fa577fb0
2 changed files with 142 additions and 0 deletions
--- a/backend/app/test/test_phase1_pdf_parser_pages.py
+++ b/backend/app/test/test_phase1_pdf_parser_pages.py
@ -0,0 +1,114 @@
 """Phase 1.5.4: Page-aware PDF parser tests.
 Tests for parse_pdf_by_page() which returns per-page text with page numbers.
 """
 from io import BytesIO
 import pytest
 def test_parse_pdf_by_page_multipage(tmp_path):
    """A three-page PDF yields three (page_number, text) tuples numbered from 1.

    The fixture PDF is generated with reportlab; the test is skipped when
    that package cannot be imported.
    """
    try:
        from reportlab.pdfgen import canvas
    except ImportError:
        pytest.skip("reportlab not installed")

    page_texts = ("Page one content", "Page two content", "Page three content")
    buffer = BytesIO()
    pdf = canvas.Canvas(buffer)
    for position, content in enumerate(page_texts):
        # Close the previous page before drawing the next one. No showPage()
        # is issued after the final page; save() follows directly.
        if position:
            pdf.showPage()
        pdf.drawString(72, 720, content)
    pdf.save()
    target = tmp_path / "multipage.pdf"
    target.write_bytes(buffer.getvalue())

    from app.utils.pdf_parser import parse_pdf_by_page
    pages = parse_pdf_by_page(str(target))

    # Exactly three entries, numbered 1..3 in order.
    assert [entry[0] for entry in pages] == [1, 2, 3]
    # Each entry carries the text drawn on the matching page.
    for entry, content in zip(pages, page_texts):
        assert content in entry[1]
 def test_parse_pdf_by_page_single_page(tmp_path):
    """A one-page PDF yields exactly one tuple: page number 1 plus its text.

    Skipped when reportlab (used to build the fixture PDF) is not importable.
    """
    try:
        from reportlab.pdfgen import canvas
    except ImportError:
        pytest.skip("reportlab not installed")

    buffer = BytesIO()
    pdf = canvas.Canvas(buffer)
    pdf.drawString(72, 720, "Only page")
    pdf.showPage()
    pdf.save()
    target = tmp_path / "single.pdf"
    target.write_bytes(buffer.getvalue())

    from app.utils.pdf_parser import parse_pdf_by_page
    pages = parse_pdf_by_page(str(target))

    assert len(pages) == 1
    page_number, page_text = pages[0]
    assert page_number == 1
    assert "Only page" in page_text
 def test_parse_pdf_by_page_empty_pages_skipped(tmp_path):
    """Blank pages are dropped while the remaining pages keep their numbers.

    The fixture has text on pages 1 and 3 and nothing drawn on page 2, so the
    expected result holds two entries numbered 1 and 3. Skipped when
    reportlab is not importable.
    """
    try:
        from reportlab.pdfgen import canvas
    except ImportError:
        pytest.skip("reportlab not installed")

    buffer = BytesIO()
    pdf = canvas.Canvas(buffer)
    # Page 1 carries text.
    pdf.drawString(72, 720, "Text page")
    pdf.showPage()
    # Page 2 is emitted with nothing drawn on it.
    pdf.showPage()
    # Page 3 carries text again.
    pdf.drawString(72, 720, "Another text page")
    pdf.showPage()
    pdf.save()
    target = tmp_path / "mixed.pdf"
    target.write_bytes(buffer.getvalue())

    from app.utils.pdf_parser import parse_pdf_by_page
    pages = parse_pdf_by_page(str(target))

    # The blank page must be absent, and page 3 must not be renumbered to 2.
    assert len(pages) == 2
    (first_number, first_text), (second_number, second_text) = pages
    assert first_number == 1
    assert "Text page" in first_text
    assert second_number == 3
    assert "Another text page" in second_text
 def test_parse_pdf_by_page_empty_pdf(tmp_path):
    """A file containing zero bytes is rejected with ValueError."""
    from app.utils.pdf_parser import parse_pdf_by_page

    target = tmp_path / "empty.pdf"
    target.write_bytes(b"")

    with pytest.raises(ValueError):
        parse_pdf_by_page(str(target))
 def test_parse_pdf_by_page_corrupted(tmp_path):
    """A file whose bytes are not PDF data is rejected with ValueError."""
    from app.utils.pdf_parser import parse_pdf_by_page

    target = tmp_path / "corrupted.pdf"
    target.write_bytes(b"not a pdf content")

    with pytest.raises(ValueError):
        parse_pdf_by_page(str(target))
--- a/backend/app/utils/pdf_parser.py
+++ b/backend/app/utils/pdf_parser.py
@ -1,3 +1,5 @@
 from typing import List, Tuple
 from pypdf import PdfReader
@ -23,3 +25,29 @@ def parse_pdf(file_path: str) -> str:
        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc
    return "\n".join(texts).strip()
 def parse_pdf_by_page(file_path: str) -> List[Tuple[int, str]]:
    """Extract the text of a PDF one page at a time.

    Args:
        file_path: Path of the PDF file, handed to ``PdfReader``.

    Returns:
        ``(page_number, page_text)`` tuples in document order. Page numbers
        start at 1 and each text has surrounding whitespace stripped. Pages
        that yield no text, or only whitespace, are left out, so the
        numbers in the result may have gaps.

    Raises:
        ValueError: If the reader cannot be constructed for the file, or
            if text extraction fails while walking the pages.
    """
    try:
        document = PdfReader(file_path)
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    extracted: List[Tuple[int, str]] = []
    try:
        for number, pdf_page in enumerate(document.pages, start=1):
            # A falsy extraction result is treated like whitespace-only text.
            cleaned = (pdf_page.extract_text() or "").strip()
            if not cleaned:
                continue
            extracted.append((number, cleaned))
    except Exception as exc:
        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc
    return extracted