From f4fa577fb009fae1f9fec93d70be938f938550b0 Mon Sep 17 00:00:00 2001
From: Woody
Date: Fri, 24 Apr 2026 10:30:04 +0800
Subject: [PATCH] feat(backend): add page-aware PDF parsing with per-page text extraction

Add parse_pdf_by_page() that returns List[Tuple[int, str]] with 1-indexed
page numbers. Pages with no extractable text are skipped. Follows same
error handling as existing parse_pdf().

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus
---
 .../app/test/test_phase1_pdf_parser_pages.py | 114 ++++++++++++++++++
 backend/app/utils/pdf_parser.py              |  28 +++++
 2 files changed, 142 insertions(+)
 create mode 100644 backend/app/test/test_phase1_pdf_parser_pages.py

diff --git a/backend/app/test/test_phase1_pdf_parser_pages.py b/backend/app/test/test_phase1_pdf_parser_pages.py
new file mode 100644
index 0000000..3576a4f
--- /dev/null
+++ b/backend/app/test/test_phase1_pdf_parser_pages.py
@@ -0,0 +1,114 @@
+"""Phase 1.5.4: Page-aware PDF parser tests.
+
+Tests for parse_pdf_by_page() which returns per-page text with page numbers.
+"""
+from io import BytesIO
+
+import pytest
+
+
+def test_parse_pdf_by_page_multipage(tmp_path):
+    """Multi-page PDF returns list of (page_number, text) tuples, 1-indexed."""
+    try:
+        from reportlab.pdfgen import canvas
+    except ImportError:
+        pytest.skip("reportlab not installed")
+
+    pdf_path = tmp_path / "multipage.pdf"
+    buf = BytesIO()
+    c = canvas.Canvas(buf)
+    c.drawString(72, 720, "Page one content")
+    c.showPage()
+    c.drawString(72, 720, "Page two content")
+    c.showPage()
+    c.drawString(72, 720, "Page three content")
+    c.save()
+    pdf_path.write_bytes(buf.getvalue())
+
+    from app.utils.pdf_parser import parse_pdf_by_page
+
+    result = parse_pdf_by_page(str(pdf_path))
+    assert len(result) == 3
+    assert result[0][0] == 1
+    assert result[1][0] == 2
+    assert result[2][0] == 3
+    assert "Page one content" in result[0][1]
+    assert "Page two content" in result[1][1]
+    assert "Page three content" in result[2][1]
+
+
+def test_parse_pdf_by_page_single_page(tmp_path):
+    """Single-page PDF returns list with one tuple [(1, 'text')]."""
+    try:
+        from reportlab.pdfgen import canvas
+    except ImportError:
+        pytest.skip("reportlab not installed")
+
+    pdf_path = tmp_path / "single.pdf"
+    buf = BytesIO()
+    c = canvas.Canvas(buf)
+    c.drawString(72, 720, "Only page")
+    c.showPage()
+    c.save()
+    pdf_path.write_bytes(buf.getvalue())
+
+    from app.utils.pdf_parser import parse_pdf_by_page
+
+    result = parse_pdf_by_page(str(pdf_path))
+    assert len(result) == 1
+    assert result[0][0] == 1
+    assert "Only page" in result[0][1]
+
+
+def test_parse_pdf_by_page_empty_pages_skipped(tmp_path):
+    """Pages with no extractable text are excluded from results."""
+    try:
+        from reportlab.pdfgen import canvas
+    except ImportError:
+        pytest.skip("reportlab not installed")
+
+    pdf_path = tmp_path / "mixed.pdf"
+    buf = BytesIO()
+    c = canvas.Canvas(buf)
+    # Page 1: has text
+    c.drawString(72, 720, "Text page")
+    c.showPage()
+    # Page 2: blank (no text)
+    c.showPage()
+    # Page 3: has text
+    c.drawString(72, 720, "Another text page")
+    c.showPage()
+    c.save()
+    pdf_path.write_bytes(buf.getvalue())
+
+    from app.utils.pdf_parser import parse_pdf_by_page
+
+    result = parse_pdf_by_page(str(pdf_path))
+    # Only pages 1 and 3 should appear, but with their original page numbers
+    assert len(result) == 2
+    assert result[0][0] == 1
+    assert "Text page" in result[0][1]
+    assert result[1][0] == 3
+    assert "Another text page" in result[1][1]
+
+
+def test_parse_pdf_by_page_empty_pdf(tmp_path):
+    """Zero-byte PDF raises ValueError."""
+    pdf_path = tmp_path / "empty.pdf"
+    pdf_path.write_bytes(b"")
+
+    from app.utils.pdf_parser import parse_pdf_by_page
+
+    with pytest.raises(ValueError):
+        parse_pdf_by_page(str(pdf_path))
+
+
+def test_parse_pdf_by_page_corrupted(tmp_path):
+    """Invalid PDF content raises ValueError."""
+    pdf_path = tmp_path / "corrupted.pdf"
+    pdf_path.write_bytes(b"not a pdf content")
+
+    from app.utils.pdf_parser import parse_pdf_by_page
+
+    with pytest.raises(ValueError):
+        parse_pdf_by_page(str(pdf_path))
diff --git a/backend/app/utils/pdf_parser.py b/backend/app/utils/pdf_parser.py
index d9001f7..1edf954 100644
--- a/backend/app/utils/pdf_parser.py
+++ b/backend/app/utils/pdf_parser.py
@@ -1,3 +1,5 @@
+from typing import List, Tuple
+
 from pypdf import PdfReader
 
 
@@ -23,3 +25,29 @@ def parse_pdf(file_path: str) -> str:
         raise ValueError(f"Failed to extract text from PDF: {exc}") from exc
 
     return "\n".join(texts).strip()
+
+
+def parse_pdf_by_page(file_path: str) -> List[Tuple[int, str]]:
+    """Parse PDF and return per-page text with page numbers (1-indexed).
+
+    Returns list of (page_number, page_text) tuples. Pages with no
+    extractable text are skipped.
+
+    Raises:
+        ValueError: If file is not a valid PDF.
+    """
+    try:
+        reader = PdfReader(file_path)
+    except Exception as exc:
+        raise ValueError(f"Invalid PDF file: {exc}") from exc
+
+    pages: List[Tuple[int, str]] = []
+    try:
+        for page_number, page in enumerate(reader.pages, start=1):
+            text = page.extract_text()
+            if text and text.strip():
+                pages.append((page_number, text.strip()))
+    except Exception as exc:
+        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc
+
+    return pages