feat(backend): add PDF page extractor and chunk PDF storage config

New pdf_extractor.py with extract_page_as_pdf() and extract_pages_as_pdf() for extracting individual PDF pages as separate files. Adds document_chunk_path setting to config and document_chunk/ to .gitignore. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-24 10:52:57 +08:00 · 2026-04-24 10:52:57 +08:00 · 8c84062996
parent 20b2f2c267
commit 8c84062996
4 changed files with 304 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -45,6 +45,9 @@ Thumbs.db
 # ChromaDB
 chroma_db/
 # Chunk PDF storage
 document_chunk/
 # Backend logs
 backend/app/log/
 *.log
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@ -19,6 +19,9 @@ class Settings(BaseSettings):
    # ChromaDB
    chroma_db_path: str = "./chroma_db"
    # Chunk PDF storage (extracted PDF pages)
    document_chunk_path: str = "./document_chunk"
    # App configuration moved to settings for easier testing/configuration
    # Cross-origin settings and chunking parameters (Phase 1 plan)
    cors_origins: list[str] = ["http://localhost:5173", "http://localhost:3000"]
--- a/backend/app/test/test_phase1_pdf_extractor.py
+++ b/backend/app/test/test_phase1_pdf_extractor.py
@ -0,0 +1,197 @@
 """Phase 1.5.5a: PDF page extractor tests.
 Tests for extract_page_as_pdf() and extract_pages_as_pdf() which extract
 individual pages from a PDF and save them as separate single-page PDF files.
 These extracted "chunk PDFs" serve as clickable source references in RAG responses.
 """
 from io import BytesIO
 from pathlib import Path
 import pytest
 from pypdf import PdfReader
 from app.utils.pdf_extractor import extract_page_as_pdf, extract_pages_as_pdf
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def _create_multipage_pdf(path: Path, page_texts: list[str]) -> None:
    """Write a PDF to *path* holding one page per entry of *page_texts*.
    Every page gets its text drawn as a single string at position (72, 720).
    """
    # Local import: reportlab is only used by this helper.
    from reportlab.pdfgen.canvas import Canvas
    stream = BytesIO()
    pdf = Canvas(stream)
    for line in page_texts:
        pdf.drawString(72, 720, line)
        pdf.showPage()  # close this page before drawing the next one
    pdf.save()
    # The document is rendered in memory first, then flushed to disk in one go.
    path.write_bytes(stream.getvalue())
 def _page_count(pdf_path: str) -> int:
    """Open *pdf_path* with pypdf and report how many pages it holds."""
    reader = PdfReader(pdf_path)
    return len(reader.pages)
 def _page_text(pdf_path: str, page_index: int = 0) -> str:
    """Extract the text of page *page_index* (0-based) from *pdf_path*.
    A falsy extraction result is normalised to the empty string.
    """
    page = PdfReader(pdf_path).pages[page_index]
    text = page.extract_text()
    return text if text else ""
 # ---------------------------------------------------------------------------
 # Fixture
 # ---------------------------------------------------------------------------
@pytest.fixture()
def multipage_pdf(tmp_path: Path) -> str:
    """Build a three-page PDF ("Alpha", "Bravo", "Charlie") inside tmp_path.
    Returns the location of the generated file as a plain string.
    """
    source = tmp_path / "source.pdf"
    _create_multipage_pdf(source, ["Alpha", "Bravo", "Charlie"])
    return str(source)
 # ---------------------------------------------------------------------------
 # extract_page_as_pdf
 # ---------------------------------------------------------------------------
 class TestExtractPageAsPdf:
    """Behaviour of extract_page_as_pdf() for valid and invalid inputs."""
    def test_valid_page_creates_single_page_pdf(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """Page 2 of the fixture is written as a one-page PDF at the given path."""
        target = str(tmp_path / "out.pdf")
        returned = extract_page_as_pdf(multipage_pdf, 2, target)
        assert returned == target
        assert _page_count(target) == 1
    def test_page_content_matches_source(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """Pages 1, 2 and 3 carry 'Alpha', 'Bravo' and 'Charlie' respectively."""
        expected_words = ("Alpha", "Bravo", "Charlie")
        # Pages are 1-indexed, so enumerate from 1; each page gets its own file.
        for number, word in enumerate(expected_words, start=1):
            target = str(tmp_path / f"page{number}.pdf")
            extract_page_as_pdf(multipage_pdf, number, target)
            assert word in _page_text(target, 0)
    def test_page_out_of_range_raises_value_error(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """A page number past the end of the document is rejected with ValueError."""
        target = tmp_path / "never.pdf"
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, 99, str(target))
    def test_page_zero_raises_value_error(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """Page 0 is invalid because page numbers start at 1."""
        target = tmp_path / "never.pdf"
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, 0, str(target))
    def test_negative_page_raises_value_error(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """A negative page number is rejected with ValueError."""
        target = tmp_path / "never.pdf"
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, -1, str(target))
    def test_nonexistent_source_raises_file_not_found(self, tmp_path: Path):
        """A source path that does not exist raises FileNotFoundError."""
        target = tmp_path / "out.pdf"
        with pytest.raises(FileNotFoundError):
            extract_page_as_pdf("/no/such/file.pdf", 1, str(target))
    def test_invalid_pdf_raises_value_error(self, tmp_path: Path):
        """A file whose bytes are not a PDF raises ValueError('Invalid PDF ...')."""
        broken = tmp_path / "bad.pdf"
        broken.write_bytes(b"not a pdf")
        target = tmp_path / "out.pdf"
        with pytest.raises(ValueError, match="Invalid PDF"):
            extract_page_as_pdf(str(broken), 1, str(target))
 # ---------------------------------------------------------------------------
 # extract_pages_as_pdf
 # ---------------------------------------------------------------------------
 class TestExtractPagesAsPdf:
    """Behaviour of extract_pages_as_pdf() for valid and invalid inputs."""
    def test_multiple_pages_creates_correct_files(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """Requesting pages [1, 3] yields two files, each holding a single page."""
        pages_dir = tmp_path / "pages"
        names = extract_pages_as_pdf(multipage_pdf, [1, 3], str(pages_dir), "doc")
        assert len(names) == 2
        for position in (0, 1):
            assert _page_count(str(pages_dir / names[position])) == 1
    def test_naming_convention(self, tmp_path: Path, multipage_pdf: str):
        """Output names have the form {stem}_page_{n}.pdf."""
        pages_dir = tmp_path / "pages"
        names = extract_pages_as_pdf(
            multipage_pdf, [1, 3], str(pages_dir), "NEC4 ACC"
        )
        for expected in ("NEC4 ACC_page_1.pdf", "NEC4 ACC_page_3.pdf"):
            assert expected in names
    def test_empty_page_numbers_returns_empty(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """An empty page list produces an empty result."""
        pages_dir = tmp_path / "pages"
        names = extract_pages_as_pdf(multipage_pdf, [], str(pages_dir), "doc")
        assert names == []
        # Whether the output directory exists afterwards is deliberately not checked.
    def test_creates_output_dir_if_missing(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """A missing (nested) output directory is created by the call."""
        nested = tmp_path / "nested" / "deep" / "dir"
        assert not nested.exists()
        names = extract_pages_as_pdf(multipage_pdf, [2], str(nested), "test")
        assert nested.is_dir()
        assert len(names) == 1
    def test_returns_relative_filenames(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """Every returned entry is a bare file name without directory parts."""
        pages_dir = tmp_path / "pages"
        names = extract_pages_as_pdf(
            multipage_pdf, [1, 2], str(pages_dir), "mydoc"
        )
        for entry in names:
            # A bare file name equals its own final path component.
            assert Path(entry).name == entry
    def test_out_of_range_page_raises_value_error(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """One out-of-range page number in the list raises ValueError."""
        pages_dir = tmp_path / "pages"
        with pytest.raises(ValueError, match="out of range"):
            extract_pages_as_pdf(multipage_pdf, [1, 50], str(pages_dir), "doc")
    def test_nonexistent_source_raises_file_not_found(self, tmp_path: Path):
        """A source path that does not exist raises FileNotFoundError."""
        pages_dir = tmp_path / "pages"
        with pytest.raises(FileNotFoundError):
            extract_pages_as_pdf("/no/such/file.pdf", [1], str(pages_dir), "doc")
    def test_invalid_pdf_raises_value_error(self, tmp_path: Path):
        """A source whose bytes are not a PDF raises ValueError('Invalid PDF ...')."""
        broken = tmp_path / "bad.pdf"
        broken.write_bytes(b"garbage")
        pages_dir = tmp_path / "pages"
        with pytest.raises(ValueError, match="Invalid PDF"):
            extract_pages_as_pdf(str(broken), [1], str(pages_dir), "doc")
    def test_extracted_content_correct(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """The files for pages 1 and 3 contain 'Alpha' and 'Charlie'."""
        pages_dir = tmp_path / "pages"
        names = extract_pages_as_pdf(
            multipage_pdf, [1, 3], str(pages_dir), "content_test"
        )
        first_file = str(pages_dir / names[0])
        second_file = str(pages_dir / names[1])
        assert "Alpha" in _page_text(first_file, 0)
        assert "Charlie" in _page_text(second_file, 0)
--- a/backend/app/utils/pdf_extractor.py
+++ b/backend/app/utils/pdf_extractor.py
@ -0,0 +1,101 @@
 import os
 from typing import List
 from pypdf import PdfReader, PdfWriter
 def extract_page_as_pdf(source_path: str, page_number: int, output_path: str) -> str:
    """Extract a single page from a PDF and save it as a new one-page PDF file.
    Args:
        source_path: Path to the original PDF file.
        page_number: 1-indexed page number to extract.
        output_path: Where to save the extracted page PDF. Missing parent
            directories are created.
    Returns:
        The output_path of the saved PDF file.
    Raises:
        FileNotFoundError: If source_path does not exist.
        ValueError: If the source cannot be read as a PDF or page_number is
            out of range.
    """
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source file not found: {source_path}")
    try:
        reader = PdfReader(source_path)
        # Count the pages inside the try as well: pypdf resolves the page tree
        # on first access, so a file can open yet still fail here (damaged page
        # tree; presumably also encrypted files — verify against the pypdf
        # version in use). Those failures belong to the documented ValueError.
        total = len(reader.pages)
    except Exception as exc:
        # Deliberately broad: any failure to open or walk the document is
        # reported uniformly as an invalid PDF, with the cause chained.
        raise ValueError(f"Invalid PDF file: {exc}") from exc
    if not 1 <= page_number <= total:
        raise ValueError(
            f"Page number {page_number} out of range (1–{total})"
        )
    writer = PdfWriter()
    writer.add_page(reader.pages[page_number - 1])  # 1-indexed -> 0-indexed
    # dirname() is "" for a bare file name; fall back to the current directory.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "wb") as f:
        writer.write(f)
    return output_path
 def extract_pages_as_pdf(
    source_path: str,
    page_numbers: List[int],
    output_dir: str,
    filename_stem: str,
 ) -> List[str]:
    """Extract multiple pages from a PDF, saving each as a separate PDF.
    Naming convention: {filename_stem}_page_{page_number}.pdf
    An empty page_numbers returns [] immediately, before the source file is
    checked or the output directory is created. A page number listed more than
    once is written again to the same file and appears once per occurrence in
    the result.
    Args:
        source_path: Path to the original PDF file.
        page_numbers: List of 1-indexed page numbers to extract.
        output_dir: Directory to save extracted PDFs; created if missing.
        filename_stem: Base name for output files (e.g. "NEC4 ACC").
    Returns:
        List of output file names (relative to output_dir), in the order of
        page_numbers.
    Raises:
        FileNotFoundError: If source_path does not exist.
        ValueError: If the source cannot be read as a PDF or any page_number
            is out of range.
    """
    if not page_numbers:
        return []
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source file not found: {source_path}")
    try:
        reader = PdfReader(source_path)
        # Count the pages inside the try as well: pypdf resolves the page tree
        # on first access, so a file can open yet still fail here (damaged page
        # tree; presumably also encrypted files — verify against the pypdf
        # version in use). Those failures belong to the documented ValueError.
        total = len(reader.pages)
    except Exception as exc:
        # Deliberately broad: any failure to open or walk the document is
        # reported uniformly as an invalid PDF, with the cause chained.
        raise ValueError(f"Invalid PDF file: {exc}") from exc
    # Validate every requested page before writing anything, so a bad page
    # number never leaves a partially populated output directory behind.
    for pn in page_numbers:
        if not 1 <= pn <= total:
            raise ValueError(
                f"Page number {pn} out of range (1–{total})"
            )
    os.makedirs(output_dir, exist_ok=True)
    output_filenames: List[str] = []
    for pn in page_numbers:
        filename = f"{filename_stem}_page_{pn}.pdf"
        full_path = os.path.join(output_dir, filename)
        # A fresh writer per page keeps every output file single-page.
        writer = PdfWriter()
        writer.add_page(reader.pages[pn - 1])  # 1-indexed -> 0-indexed
        with open(full_path, "wb") as f:
            writer.write(f)
        output_filenames.append(filename)
    return output_filenames