feat(backend): add page-aware PDF parsing with per-page text extraction

Add parse_pdf_by_page(), which returns a List[Tuple[int, str]] of (page_number, text) pairs with 1-indexed page numbers. Pages with no extractable text are skipped. It follows the same error handling as the existing parse_pdf().

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-24 10:30:04 +08:00
parent 5dcb71369c
commit f4fa577fb0
2 changed files with 142 additions and 0 deletions

View File

@ -0,0 +1,114 @@
"""Phase 1.5.4: Page-aware PDF parser tests.
Tests for parse_pdf_by_page() which returns per-page text with page numbers.
"""
from io import BytesIO
import pytest
def test_parse_pdf_by_page_multipage(tmp_path):
    """Multi-page PDF returns list of (page_number, text) tuples, 1-indexed.

    A three-page PDF is generated with reportlab, written under ``tmp_path``
    and parsed. Every page must come back, in order, with its 1-based page
    number and its own text. The test is skipped when reportlab cannot be
    imported.
    """
    # importorskip returns the imported submodule, so ``canvas`` is the same
    # module object that ``from reportlab.pdfgen import canvas`` would bind.
    canvas = pytest.importorskip(
        "reportlab.pdfgen.canvas", reason="reportlab not installed"
    )

    pdf_path = tmp_path / "multipage.pdf"
    buf = BytesIO()
    pdf = canvas.Canvas(buf)
    pdf.drawString(72, 720, "Page one content")
    pdf.showPage()
    pdf.drawString(72, 720, "Page two content")
    pdf.showPage()
    pdf.drawString(72, 720, "Page three content")
    pdf.save()
    pdf_path.write_bytes(buf.getvalue())

    from app.utils.pdf_parser import parse_pdf_by_page

    result = parse_pdf_by_page(str(pdf_path))

    # One comparison covers both the page count and the 1-indexed numbering,
    # and shows the whole sequence in the failure message.
    assert [page_number for page_number, _ in result] == [1, 2, 3]
    assert "Page one content" in result[0][1]
    assert "Page two content" in result[1][1]
    assert "Page three content" in result[2][1]
def test_parse_pdf_by_page_single_page(tmp_path):
    """Single-page PDF returns list with one tuple [(1, 'text')].

    A one-page PDF is generated with reportlab, written under ``tmp_path``
    and parsed. The test is skipped when reportlab cannot be imported.
    """
    # importorskip returns the imported submodule, so ``canvas`` is the same
    # module object that ``from reportlab.pdfgen import canvas`` would bind.
    canvas = pytest.importorskip(
        "reportlab.pdfgen.canvas", reason="reportlab not installed"
    )

    pdf_path = tmp_path / "single.pdf"
    buf = BytesIO()
    pdf = canvas.Canvas(buf)
    pdf.drawString(72, 720, "Only page")
    pdf.showPage()
    pdf.save()
    pdf_path.write_bytes(buf.getvalue())

    from app.utils.pdf_parser import parse_pdf_by_page

    result = parse_pdf_by_page(str(pdf_path))

    # Checks the page count and the 1-based page number in one assertion.
    assert [page_number for page_number, _ in result] == [1]
    assert "Only page" in result[0][1]
def test_parse_pdf_by_page_empty_pages_skipped(tmp_path):
    """Pages with no extractable text are excluded from results.

    The generated PDF has text on pages 1 and 3 and nothing drawn on page 2.
    The surviving entries must keep their original page numbers rather than
    being renumbered. The test is skipped when reportlab cannot be imported.
    """
    # importorskip returns the imported submodule, so ``canvas`` is the same
    # module object that ``from reportlab.pdfgen import canvas`` would bind.
    canvas = pytest.importorskip(
        "reportlab.pdfgen.canvas", reason="reportlab not installed"
    )

    pdf_path = tmp_path / "mixed.pdf"
    buf = BytesIO()
    pdf = canvas.Canvas(buf)
    # Page 1: has text
    pdf.drawString(72, 720, "Text page")
    pdf.showPage()
    # Page 2: blank (showPage() with nothing drawn on it)
    pdf.showPage()
    # Page 3: has text
    pdf.drawString(72, 720, "Another text page")
    pdf.showPage()
    pdf.save()
    pdf_path.write_bytes(buf.getvalue())

    from app.utils.pdf_parser import parse_pdf_by_page

    result = parse_pdf_by_page(str(pdf_path))

    # Only pages 1 and 3 should appear, but with their original page numbers.
    assert [page_number for page_number, _ in result] == [1, 3]
    assert "Text page" in result[0][1]
    assert "Another text page" in result[1][1]
def test_parse_pdf_by_page_empty_pdf(tmp_path):
    """A zero-byte file with a .pdf name is rejected with ValueError."""
    from app.utils.pdf_parser import parse_pdf_by_page

    # Create the file with no content at all.
    zero_byte_file = tmp_path / "empty.pdf"
    zero_byte_file.write_bytes(b"")
    target = str(zero_byte_file)

    with pytest.raises(ValueError):
        parse_pdf_by_page(target)
def test_parse_pdf_by_page_corrupted(tmp_path):
    """A file whose bytes are not PDF data is rejected with ValueError."""
    from app.utils.pdf_parser import parse_pdf_by_page

    # Plain bytes saved under a .pdf name; nothing in them is PDF structure.
    bogus_file = tmp_path / "corrupted.pdf"
    bogus_file.write_bytes(b"not a pdf content")
    target = str(bogus_file)

    with pytest.raises(ValueError):
        parse_pdf_by_page(target)

View File

@ -1,3 +1,5 @@
from typing import List, Tuple
from pypdf import PdfReader
@ -23,3 +25,29 @@ def parse_pdf(file_path: str) -> str:
raise ValueError(f"Failed to extract text from PDF: {exc}") from exc
return "\n".join(texts).strip()
def parse_pdf_by_page(file_path: str) -> List[Tuple[int, str]]:
    """Extract text from a PDF one page at a time.

    Args:
        file_path: Path of the PDF file, passed straight to ``PdfReader``.

    Returns:
        A list of ``(page_number, page_text)`` tuples in document order.
        Page numbers start at 1 and refer to the position in the PDF, so
        they are not renumbered when a page is dropped. Page text is
        stripped of leading/trailing whitespace; pages whose text is
        missing or whitespace-only are left out.

    Raises:
        ValueError: If ``PdfReader`` cannot be constructed for the file, or
            if walking the pages / extracting their text fails. The
            underlying exception is chained as the cause.
    """
    try:
        document = PdfReader(file_path)
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    try:
        # Normalise every page first: a falsy extract_text() result (None
        # or "") becomes "", so the filter below only has to test for
        # emptiness.
        numbered_text = [
            (number, (page.extract_text() or "").strip())
            for number, page in enumerate(document.pages, start=1)
        ]
    except Exception as exc:
        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc

    # Drop pages that yielded no text while keeping original page numbers.
    return [(number, content) for number, content in numbered_text if content]