feat(backend): add page-aware PDF parsing with per-page text extraction
Add parse_pdf_by_page(), which returns a List[Tuple[int, str]] of (page_number, text) pairs with 1-indexed page numbers. Pages with no extractable text are skipped. Follows the same error handling as the existing parse_pdf(). Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
5dcb71369c
commit
f4fa577fb0
|
|
@ -0,0 +1,114 @@
|
||||||
|
"""Phase 1.5.4: Page-aware PDF parser tests.
|
||||||
|
|
||||||
|
Tests for parse_pdf_by_page() which returns per-page text with page numbers.
|
||||||
|
"""
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_pdf_by_page_multipage(tmp_path):
    """A three-page PDF yields three (page_number, text) tuples numbered from 1."""
    try:
        from reportlab.pdfgen import canvas
    except ImportError:
        pytest.skip("reportlab not installed")

    page_texts = ("Page one content", "Page two content", "Page three content")

    # Render the document into memory, then persist the bytes to the temp file.
    stream = BytesIO()
    pdf = canvas.Canvas(stream)
    for index, page_text in enumerate(page_texts):
        if index:
            # Finish the previous page before drawing on the next one.
            pdf.showPage()
        pdf.drawString(72, 720, page_text)
    pdf.save()

    target = tmp_path / "multipage.pdf"
    target.write_bytes(stream.getvalue())

    from app.utils.pdf_parser import parse_pdf_by_page

    extracted = parse_pdf_by_page(str(target))

    assert len(extracted) == 3
    for expected_number, expected_text in enumerate(page_texts, start=1):
        entry = extracted[expected_number - 1]
        assert entry[0] == expected_number
        assert expected_text in entry[1]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_pdf_by_page_single_page(tmp_path):
    """A one-page PDF yields exactly one tuple, and it is numbered 1."""
    try:
        from reportlab.pdfgen import canvas
    except ImportError:
        pytest.skip("reportlab not installed")

    # Render the document into memory, then persist the bytes to the temp file.
    stream = BytesIO()
    pdf = canvas.Canvas(stream)
    pdf.drawString(72, 720, "Only page")
    pdf.showPage()
    pdf.save()

    target = tmp_path / "single.pdf"
    target.write_bytes(stream.getvalue())

    from app.utils.pdf_parser import parse_pdf_by_page

    extracted = parse_pdf_by_page(str(target))

    assert len(extracted) == 1
    only_entry = extracted[0]
    assert only_entry[0] == 1
    assert "Only page" in only_entry[1]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_pdf_by_page_empty_pages_skipped(tmp_path):
    """Blank pages are dropped, and the remaining pages keep their original numbers."""
    try:
        from reportlab.pdfgen import canvas
    except ImportError:
        pytest.skip("reportlab not installed")

    # One entry per page; None marks a page emitted with nothing drawn on it.
    layout = ("Text page", None, "Another text page")

    stream = BytesIO()
    pdf = canvas.Canvas(stream)
    for page_text in layout:
        if page_text is not None:
            pdf.drawString(72, 720, page_text)
        pdf.showPage()
    pdf.save()

    target = tmp_path / "mixed.pdf"
    target.write_bytes(stream.getvalue())

    from app.utils.pdf_parser import parse_pdf_by_page

    extracted = parse_pdf_by_page(str(target))

    # Page 2 is blank, so only pages 1 and 3 are expected, under their
    # original page numbers rather than renumbered positions.
    expected = ((1, "Text page"), (3, "Another text page"))
    assert len(extracted) == 2
    for entry, (expected_number, expected_text) in zip(extracted, expected):
        assert entry[0] == expected_number
        assert expected_text in entry[1]
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_pdf_by_page_empty_pdf(tmp_path):
    """A zero-byte file is rejected with ValueError."""
    from app.utils.pdf_parser import parse_pdf_by_page

    target = tmp_path / "empty.pdf"
    target.write_bytes(b"")

    with pytest.raises(ValueError):
        parse_pdf_by_page(str(target))
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_pdf_by_page_corrupted(tmp_path):
    """A file whose bytes are not PDF data is rejected with ValueError."""
    from app.utils.pdf_parser import parse_pdf_by_page

    target = tmp_path / "corrupted.pdf"
    target.write_bytes(b"not a pdf content")

    with pytest.raises(ValueError):
        parse_pdf_by_page(str(target))
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -23,3 +25,29 @@ def parse_pdf(file_path: str) -> str:
|
||||||
raise ValueError(f"Failed to extract text from PDF: {exc}") from exc
|
raise ValueError(f"Failed to extract text from PDF: {exc}") from exc
|
||||||
|
|
||||||
return "\n".join(texts).strip()
|
return "\n".join(texts).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_pdf_by_page(file_path: str) -> List[Tuple[int, str]]:
    """Extract the text of a PDF one page at a time.

    Args:
        file_path: Path passed to ``PdfReader``.

    Returns:
        ``(page_number, page_text)`` tuples in document order. Page numbers
        start at 1 and reflect each page's position in the document, so the
        numbers of omitted pages are simply absent. Text has surrounding
        whitespace stripped; pages whose extracted text is missing or
        whitespace-only are omitted.

    Raises:
        ValueError: If ``PdfReader`` cannot open the file, or if text
            extraction fails on any page. The underlying exception is
            chained as the cause.
    """
    try:
        document = PdfReader(file_path)
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    collected: List[Tuple[int, str]] = []
    try:
        for number, page in enumerate(document.pages, start=1):
            # A falsy extraction result is treated the same as blank text.
            cleaned = (page.extract_text() or "").strip()
            if not cleaned:
                continue
            collected.append((number, cleaned))
    except Exception as exc:
        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc

    return collected
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue