feat(backend): add PDF page extractor and chunk PDF storage config

New pdf_extractor.py with extract_page_as_pdf() and extract_pages_as_pdf() for extracting individual PDF pages as separate single-page files. Adds the document_chunk_path setting to config and document_chunk/ to .gitignore.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-24 10:52:57 +08:00 · 2026-04-24 10:52:57 +08:00 · 8c84062996
parent 20b2f2c267
commit 8c84062996
4 changed files with 304 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -45,6 +45,9 @@ Thumbs.db
 # ChromaDB
 chroma_db/

+# Chunk PDF storage
+document_chunk/
+
 # Backend logs
 backend/app/log/
 *.log
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@ -19,6 +19,9 @@ class Settings(BaseSettings):
    # ChromaDB
    chroma_db_path: str = "./chroma_db"

+    # Chunk PDF storage (extracted PDF pages)
+    document_chunk_path: str = "./document_chunk"
+
    # App configuration moved to settings for easier testing/configuration
    # Cross-origin settings and chunking parameters (Phase 1 plan)
    cors_origins: list[str] = ["http://localhost:5173", "http://localhost:3000"]
--- a/backend/app/test/test_phase1_pdf_extractor.py
+++ b/backend/app/test/test_phase1_pdf_extractor.py
@ -0,0 +1,197 @@
+"""Phase 1.5.5a: PDF page extractor tests.
+
+Tests for extract_page_as_pdf() and extract_pages_as_pdf() which extract
+individual pages from a PDF and save them as separate single-page PDF files.
+
+These extracted "chunk PDFs" serve as clickable source references in RAG responses.
+"""
+from io import BytesIO
+from pathlib import Path
+
+import pytest
+from pypdf import PdfReader
+
+from app.utils.pdf_extractor import extract_page_as_pdf, extract_pages_as_pdf
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _create_multipage_pdf(path: Path, page_texts: list[str]) -> None:
+    """Write a PDF to *path* holding one page per entry of *page_texts*."""
+    from reportlab.pdfgen import canvas
+
+    stream = BytesIO()
+    pdf = canvas.Canvas(stream)
+    for line in page_texts:
+        # Draw this page's text, then close the page so the next one starts.
+        pdf.drawString(72, 720, line)
+        pdf.showPage()
+    pdf.save()
+    path.write_bytes(stream.getvalue())
+
+
+def _page_count(pdf_path: str) -> int:
+    """Count the pages of the PDF stored at *pdf_path*."""
+    reader = PdfReader(pdf_path)
+    return len(reader.pages)
+
+
+def _page_text(pdf_path: str, page_index: int = 0) -> str:
+    """Extract the text of page *page_index* (0-indexed); "" when there is none."""
+    page = PdfReader(pdf_path).pages[page_index]
+    text = page.extract_text()
+    return text if text else ""
+
+
+# ---------------------------------------------------------------------------
+# Fixture
+# ---------------------------------------------------------------------------
+
+@pytest.fixture()
+def multipage_pdf(tmp_path: Path) -> str:
+    """Build a 3-page source PDF under *tmp_path* and return its path as a string."""
+    source = tmp_path / "source.pdf"
+    page_texts = ["Alpha", "Bravo", "Charlie"]
+    _create_multipage_pdf(source, page_texts)
+    return str(source)
+
+
+# ---------------------------------------------------------------------------
+# extract_page_as_pdf
+# ---------------------------------------------------------------------------
+
+class TestExtractPageAsPdf:
+    """Behaviour of extract_page_as_pdf() for valid and invalid inputs."""
+
+    def test_valid_page_creates_single_page_pdf(self, tmp_path: Path, multipage_pdf: str):
+        """The file written for page 2 holds exactly one page and its path is returned."""
+        target = str(tmp_path / "out.pdf")
+        returned = extract_page_as_pdf(multipage_pdf, 2, target)
+        assert returned == target
+        assert _page_count(target) == 1
+
+    def test_page_content_matches_source(self, tmp_path: Path, multipage_pdf: str):
+        """Pages 1, 2 and 3 carry 'Alpha', 'Bravo' and 'Charlie' respectively."""
+        expected = {1: "Alpha", 2: "Bravo", 3: "Charlie"}
+        for page_number, text in expected.items():
+            target = str(tmp_path / f"page{page_number}.pdf")
+            extract_page_as_pdf(multipage_pdf, page_number, target)
+            assert text in _page_text(target, 0)
+
+    def test_page_out_of_range_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
+        """A page number past the last page is rejected with ValueError."""
+        target = str(tmp_path / "never.pdf")
+        with pytest.raises(ValueError, match="out of range"):
+            extract_page_as_pdf(multipage_pdf, 99, target)
+
+    def test_page_zero_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
+        """Page numbers are 1-indexed, so 0 is rejected with ValueError."""
+        target = str(tmp_path / "never.pdf")
+        with pytest.raises(ValueError, match="out of range"):
+            extract_page_as_pdf(multipage_pdf, 0, target)
+
+    def test_negative_page_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
+        """A negative page number is rejected with ValueError."""
+        target = str(tmp_path / "never.pdf")
+        with pytest.raises(ValueError, match="out of range"):
+            extract_page_as_pdf(multipage_pdf, -1, target)
+
+    def test_nonexistent_source_raises_file_not_found(self, tmp_path: Path):
+        """A source path that does not exist raises FileNotFoundError."""
+        target = str(tmp_path / "out.pdf")
+        with pytest.raises(FileNotFoundError):
+            extract_page_as_pdf("/no/such/file.pdf", 1, target)
+
+    def test_invalid_pdf_raises_value_error(self, tmp_path: Path):
+        """A file whose bytes are not a PDF raises ValueError."""
+        corrupt = tmp_path / "bad.pdf"
+        corrupt.write_bytes(b"not a pdf")
+        target = str(tmp_path / "out.pdf")
+        with pytest.raises(ValueError, match="Invalid PDF"):
+            extract_page_as_pdf(str(corrupt), 1, target)
+
+
+# ---------------------------------------------------------------------------
+# extract_pages_as_pdf
+# ---------------------------------------------------------------------------
+
+class TestExtractPagesAsPdf:
+    """Behaviour of extract_pages_as_pdf() for valid and invalid inputs."""
+
+    def test_multiple_pages_creates_correct_files(
+        self, tmp_path: Path, multipage_pdf: str
+    ):
+        """Requesting pages [1, 3] of a 3-page PDF yields two single-page files."""
+        out_dir = tmp_path / "pages"
+        names = extract_pages_as_pdf(multipage_pdf, [1, 3], str(out_dir), "doc")
+        assert len(names) == 2
+        for name in names:
+            assert _page_count(str(out_dir / name)) == 1
+
+    def test_naming_convention(self, tmp_path: Path, multipage_pdf: str):
+        """Returned names follow the {stem}_page_{n}.pdf pattern."""
+        target_dir = str(tmp_path / "pages")
+        names = extract_pages_as_pdf(multipage_pdf, [1, 3], target_dir, "NEC4 ACC")
+        assert "NEC4 ACC_page_1.pdf" in names
+        assert "NEC4 ACC_page_3.pdf" in names
+
+    def test_empty_page_numbers_returns_empty(self, tmp_path: Path, multipage_pdf: str):
+        """An empty page_numbers list yields an empty result."""
+        target_dir = str(tmp_path / "pages")
+        names = extract_pages_as_pdf(multipage_pdf, [], target_dir, "doc")
+        assert names == []
+        # output dir may or may not be created for empty input — both are fine
+
+    def test_creates_output_dir_if_missing(self, tmp_path: Path, multipage_pdf: str):
+        """A missing (nested) output_dir is created by the call."""
+        out_dir = tmp_path / "nested" / "deep" / "dir"
+        assert not out_dir.exists()
+        names = extract_pages_as_pdf(multipage_pdf, [2], str(out_dir), "test")
+        assert out_dir.is_dir()
+        assert len(names) == 1
+
+    def test_returns_relative_filenames(self, tmp_path: Path, multipage_pdf: str):
+        """Every returned entry is a bare filename without directory parts."""
+        target_dir = str(tmp_path / "pages")
+        names = extract_pages_as_pdf(multipage_pdf, [1, 2], target_dir, "mydoc")
+        assert all(Path(name).name == name for name in names)
+
+    def test_out_of_range_page_raises_value_error(
+        self, tmp_path: Path, multipage_pdf: str
+    ):
+        """One out-of-range entry in page_numbers raises ValueError."""
+        target_dir = str(tmp_path / "pages")
+        with pytest.raises(ValueError, match="out of range"):
+            extract_pages_as_pdf(multipage_pdf, [1, 50], target_dir, "doc")
+
+    def test_nonexistent_source_raises_file_not_found(self, tmp_path: Path):
+        """A source path that does not exist raises FileNotFoundError."""
+        target_dir = str(tmp_path / "pages")
+        with pytest.raises(FileNotFoundError):
+            extract_pages_as_pdf("/no/such/file.pdf", [1], target_dir, "doc")
+
+    def test_invalid_pdf_raises_value_error(self, tmp_path: Path):
+        """A file whose bytes are not a PDF raises ValueError."""
+        corrupt = tmp_path / "bad.pdf"
+        corrupt.write_bytes(b"garbage")
+        target_dir = str(tmp_path / "pages")
+        with pytest.raises(ValueError, match="Invalid PDF"):
+            extract_pages_as_pdf(str(corrupt), [1], target_dir, "doc")
+
+    def test_extracted_content_correct(self, tmp_path: Path, multipage_pdf: str):
+        """The files for pages 1 and 3 contain 'Alpha' and 'Charlie'."""
+        out_dir = tmp_path / "pages"
+        names = extract_pages_as_pdf(multipage_pdf, [1, 3], str(out_dir), "content_test")
+        first, second = names[0], names[1]
+        assert "Alpha" in _page_text(str(out_dir / first), 0)
+        assert "Charlie" in _page_text(str(out_dir / second), 0)
--- a/backend/app/utils/pdf_extractor.py
+++ b/backend/app/utils/pdf_extractor.py
@ -0,0 +1,101 @@
+import os
+from typing import List
+
+from pypdf import PdfReader, PdfWriter
+
+
+def extract_page_as_pdf(source_path: str, page_number: int, output_path: str) -> str:
+    """Extract a single page from a PDF and save it as a new one-page PDF file.
+
+    Args:
+        source_path: Path to original PDF file.
+        page_number: 1-indexed page number to extract.
+        output_path: Where to save the extracted page PDF. Missing parent
+            directories are created.
+
+    Returns:
+        The output_path of the saved PDF file.
+
+    Raises:
+        FileNotFoundError: If source_path does not exist.
+        ValueError: If source is not a valid PDF or page_number is out of range.
+    """
+    if not os.path.exists(source_path):
+        raise FileNotFoundError(f"Source file not found: {source_path}")
+
+    try:
+        reader = PdfReader(source_path)
+        # Counting pages resolves the page tree, which can itself raise on a
+        # damaged or still-encrypted file; keeping it inside the try makes
+        # those failures surface as the documented ValueError too.
+        total = len(reader.pages)
+    except Exception as exc:
+        raise ValueError(f"Invalid PDF file: {exc}") from exc
+
+    if not 1 <= page_number <= total:
+        raise ValueError(
+            f"Page number {page_number} out of range (1–{total})"
+        )
+
+    writer = PdfWriter()
+    # Callers pass 1-indexed page numbers; reader.pages is 0-indexed.
+    writer.add_page(reader.pages[page_number - 1])
+
+    # dirname is "" for a bare filename; fall back to the current directory.
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+    with open(output_path, "wb") as f:
+        writer.write(f)
+
+    return output_path
+
+
+def extract_pages_as_pdf(
+    source_path: str,
+    page_numbers: List[int],
+    output_dir: str,
+    filename_stem: str,
+) -> List[str]:
+    """Extract multiple pages from a PDF, saving each as a separate one-page PDF.
+
+    Naming convention: {filename_stem}_page_{page_number}.pdf
+
+    Every page number is validated before anything is written, so an
+    out-of-range entry produces no output files.
+
+    Args:
+        source_path: Path to original PDF file.
+        page_numbers: List of 1-indexed page numbers to extract. An empty
+            list returns [] immediately, before source_path is checked.
+        output_dir: Directory to save extracted PDFs (created if missing).
+        filename_stem: Base name for output files (e.g. "NEC4 ACC").
+
+    Returns:
+        List of output file names (relative to output_dir), in the same
+        order as page_numbers.
+
+    Raises:
+        FileNotFoundError: If source_path does not exist.
+        ValueError: If source is not a valid PDF or any page_number is out of range.
+    """
+    if not page_numbers:
+        return []
+
+    if not os.path.exists(source_path):
+        raise FileNotFoundError(f"Source file not found: {source_path}")
+
+    try:
+        reader = PdfReader(source_path)
+        # Counting pages resolves the page tree, which can itself raise on a
+        # damaged or still-encrypted file; keeping it inside the try makes
+        # those failures surface as the documented ValueError too.
+        total = len(reader.pages)
+    except Exception as exc:
+        raise ValueError(f"Invalid PDF file: {exc}") from exc
+
+    # Validate the whole request up front so a bad entry writes nothing.
+    for pn in page_numbers:
+        if not 1 <= pn <= total:
+            raise ValueError(
+                f"Page number {pn} out of range (1–{total})"
+            )
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    output_filenames: List[str] = []
+    for pn in page_numbers:
+        filename = f"{filename_stem}_page_{pn}.pdf"
+        full_path = os.path.join(output_dir, filename)
+
+        # A fresh writer per page keeps every output file single-page.
+        writer = PdfWriter()
+        writer.add_page(reader.pages[pn - 1])
+        with open(full_path, "wb") as f:
+            writer.write(f)
+
+        output_filenames.append(filename)
+
+    return output_filenames