feat(backend): add page-aware PDF parsing with per-page text extraction

Add parse_pdf_by_page(), which returns List[Tuple[int, str]] with 1-indexed page numbers. Pages with no extractable text are skipped. Follows the same error handling as the existing parse_pdf(). Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-24 10:30:04 +08:00 · 2026-04-24 10:30:04 +08:00 · f4fa577fb0
parent 5dcb71369c
commit f4fa577fb0
2 changed files with 142 additions and 0 deletions
--- a/backend/app/test/test_phase1_pdf_parser_pages.py
+++ b/backend/app/test/test_phase1_pdf_parser_pages.py
@ -0,0 +1,114 @@
+"""Phase 1.5.4: Page-aware PDF parser tests.
+
+Tests for parse_pdf_by_page() which returns per-page text with page numbers.
+"""
+from io import BytesIO
+
+import pytest
+
+
+def test_parse_pdf_by_page_multipage(tmp_path):
+    """Three-page PDF yields (page_number, text) tuples numbered from 1."""
+    try:
+        from reportlab.pdfgen import canvas
+    except ImportError:
+        pytest.skip("reportlab not installed")
+
+    expected_texts = ["Page one content", "Page two content", "Page three content"]
+
+    # Draw one line of text per page; showPage() separates consecutive pages.
+    target = tmp_path / "multipage.pdf"
+    stream = BytesIO()
+    pdf = canvas.Canvas(stream)
+    for index, line in enumerate(expected_texts):
+        if index > 0:
+            pdf.showPage()
+        pdf.drawString(72, 720, line)
+    pdf.save()
+    target.write_bytes(stream.getvalue())
+
+    from app.utils.pdf_parser import parse_pdf_by_page
+
+    pages = parse_pdf_by_page(str(target))
+    assert len(pages) == 3
+    # Page numbers are 1-indexed and follow document order.
+    assert [entry[0] for entry in pages] == [1, 2, 3]
+    for entry, expected in zip(pages, expected_texts):
+        assert expected in entry[1]
+
+
+def test_parse_pdf_by_page_single_page(tmp_path):
+    """One page of text comes back as a single entry numbered 1."""
+    try:
+        from reportlab.pdfgen import canvas
+    except ImportError:
+        pytest.skip("reportlab not installed")
+
+    stream = BytesIO()
+    pdf = canvas.Canvas(stream)
+    pdf.drawString(72, 720, "Only page")
+    pdf.showPage()
+    pdf.save()
+    target = tmp_path / "single.pdf"
+    target.write_bytes(stream.getvalue())
+
+    from app.utils.pdf_parser import parse_pdf_by_page
+    pages = parse_pdf_by_page(str(target))
+    assert len(pages) == 1
+    only_entry = pages[0]
+    assert only_entry[0] == 1
+    assert "Only page" in only_entry[1]
+
+
+def test_parse_pdf_by_page_empty_pages_skipped(tmp_path):
+    """Blank pages are dropped; remaining pages keep their original numbers."""
+    try:
+        from reportlab.pdfgen import canvas
+    except ImportError:
+        pytest.skip("reportlab not installed")
+
+    # None marks a page that is emitted without any text drawn on it.
+    layout = ["Text page", None, "Another text page"]
+
+    target = tmp_path / "mixed.pdf"
+    stream = BytesIO()
+    pdf = canvas.Canvas(stream)
+    for line in layout:
+        if line is not None:
+            pdf.drawString(72, 720, line)
+        pdf.showPage()
+    pdf.save()
+    target.write_bytes(stream.getvalue())
+
+    from app.utils.pdf_parser import parse_pdf_by_page
+
+    pages = parse_pdf_by_page(str(target))
+    # The blank second page is absent, so the numbers jump from 1 to 3.
+    assert len(pages) == 2
+    first, second = pages[0], pages[1]
+    assert first[0] == 1
+    assert "Text page" in first[1]
+    assert second[0] == 3
+    assert "Another text page" in second[1]
+
+
+def test_parse_pdf_by_page_empty_pdf(tmp_path):
+    """A file containing no bytes at all is rejected with ValueError."""
+    from app.utils.pdf_parser import parse_pdf_by_page
+
+    target = tmp_path / "empty.pdf"
+    target.write_bytes(b"")
+
+    with pytest.raises(ValueError):
+        parse_pdf_by_page(str(target))
+
+
+def test_parse_pdf_by_page_corrupted(tmp_path):
+    """Bytes that do not form a PDF document are rejected with ValueError."""
+    from app.utils.pdf_parser import parse_pdf_by_page
+
+    target = tmp_path / "corrupted.pdf"
+    target.write_bytes(b"not a pdf content")
+
+    with pytest.raises(ValueError):
+        parse_pdf_by_page(str(target))
--- a/backend/app/utils/pdf_parser.py
+++ b/backend/app/utils/pdf_parser.py
@ -1,3 +1,5 @@
+from typing import List, Tuple
+
 from pypdf import PdfReader


@ -23,3 +25,29 @@ def parse_pdf(file_path: str) -> str:
        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc

    return "\n".join(texts).strip()
+
+
+def parse_pdf_by_page(file_path: str) -> List[Tuple[int, str]]:
+    """Return (page_number, text) tuples for pages with extractable text.
+
+    Page numbers are 1-indexed; each text is stripped of outer whitespace.
+
+    Raises:
+        ValueError: If the PDF cannot be opened or text extraction fails.
+    """
+    try:
+        document = PdfReader(file_path)
+    except Exception as exc:
+        raise ValueError(f"Invalid PDF file: {exc}") from exc
+
+    results: List[Tuple[int, str]] = []
+    try:
+        for number, page in enumerate(document.pages, 1):
+            raw = page.extract_text()
+            cleaned = raw.strip() if raw else ""
+            if cleaned:
+                results.append((number, cleaned))
+    except Exception as exc:
+        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc
+
+    return results