feat(backend): add PDF page extractor and chunk PDF storage config

New pdf_extractor.py with extract_page_as_pdf() and extract_pages_as_pdf() for extracting individual PDF pages as separate files. Adds document_chunk_path setting to config and document_chunk/ to .gitignore.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-24 10:52:57 +08:00
parent 20b2f2c267
commit 8c84062996
4 changed files with 304 additions and 0 deletions

3
.gitignore vendored
View File

@ -45,6 +45,9 @@ Thumbs.db
# ChromaDB
chroma_db/
# Chunk PDF storage
document_chunk/
# Backend logs
backend/app/log/
*.log

View File

@ -19,6 +19,9 @@ class Settings(BaseSettings):
# ChromaDB
chroma_db_path: str = "./chroma_db"
# Chunk PDF storage (extracted PDF pages)
document_chunk_path: str = "./document_chunk"
# App configuration moved to settings for easier testing/configuration
# Cross-origin settings and chunking parameters (Phase 1 plan)
cors_origins: list[str] = ["http://localhost:5173", "http://localhost:3000"]

View File

@ -0,0 +1,197 @@
"""Phase 1.5.5a: PDF page extractor tests.
Tests for extract_page_as_pdf() and extract_pages_as_pdf() which extract
individual pages from a PDF and save them as separate single-page PDF files.
These extracted "chunk PDFs" serve as clickable source references in RAG responses.
"""
from io import BytesIO
from pathlib import Path
import pytest
from pypdf import PdfReader
from app.utils.pdf_extractor import extract_page_as_pdf, extract_pages_as_pdf
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _create_multipage_pdf(path: Path, page_texts: list[str]) -> None:
    """Write a PDF to *path* containing one page per entry of *page_texts*.

    Every page carries its text as a single string drawn at (72, 720).
    The document is rendered into memory first and then written to disk
    in one call.
    """
    # Imported inside the helper rather than at module level.
    from reportlab.pdfgen import canvas

    buffer = BytesIO()
    pdf = canvas.Canvas(buffer)
    for page_text in page_texts:
        pdf.drawString(72, 720, page_text)
        pdf.showPage()  # close the current page before drawing the next
    pdf.save()

    rendered = buffer.getvalue()
    path.write_bytes(rendered)
def _page_count(pdf_path: str) -> int:
    """Open *pdf_path* with pypdf and report how many pages it contains."""
    reader = PdfReader(pdf_path)
    return len(reader.pages)
def _page_text(pdf_path: str, page_index: int = 0) -> str:
    """Extract the text of page *page_index* (0-based) from *pdf_path*.

    An empty string is returned when text extraction yields a falsy result.
    """
    page = PdfReader(pdf_path).pages[page_index]
    extracted = page.extract_text()
    return extracted if extracted else ""
# ---------------------------------------------------------------------------
# Fixture
# ---------------------------------------------------------------------------
@pytest.fixture()
def multipage_pdf(tmp_path: Path) -> str:
    """Build a three-page PDF ("Alpha", "Bravo", "Charlie") inside tmp_path.

    The path of the generated file is returned as a string.
    """
    source = tmp_path / "source.pdf"
    page_texts = ["Alpha", "Bravo", "Charlie"]
    _create_multipage_pdf(source, page_texts)
    return str(source)
# ---------------------------------------------------------------------------
# extract_page_as_pdf
# ---------------------------------------------------------------------------
class TestExtractPageAsPdf:
    """Behavioural checks for extract_page_as_pdf()."""

    def test_valid_page_creates_single_page_pdf(self, tmp_path: Path, multipage_pdf: str):
        """The file written for page 2 holds one page and its path is returned."""
        target = str(tmp_path / "out.pdf")
        returned = extract_page_as_pdf(multipage_pdf, 2, target)
        assert returned == target
        assert _page_count(target) == 1

    def test_page_content_matches_source(self, tmp_path: Path, multipage_pdf: str):
        """Pages 1, 2 and 3 yield 'Alpha', 'Bravo' and 'Charlie' respectively."""
        expected_texts = ("Alpha", "Bravo", "Charlie")
        for page_number, expected in enumerate(expected_texts, start=1):
            target = str(tmp_path / f"page{page_number}.pdf")
            extract_page_as_pdf(multipage_pdf, page_number, target)
            assert expected in _page_text(target, 0)

    def test_page_out_of_range_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
        """A page number past the end of the document is rejected."""
        target = str(tmp_path / "never.pdf")
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, 99, target)

    def test_page_zero_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
        """Page 0 is rejected because page numbers are 1-indexed."""
        target = str(tmp_path / "never.pdf")
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, 0, target)

    def test_negative_page_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
        """A negative page number is rejected."""
        target = str(tmp_path / "never.pdf")
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, -1, target)

    def test_nonexistent_source_raises_file_not_found(self, tmp_path: Path):
        """A source path that does not exist raises FileNotFoundError."""
        target = str(tmp_path / "out.pdf")
        with pytest.raises(FileNotFoundError):
            extract_page_as_pdf("/no/such/file.pdf", 1, target)

    def test_invalid_pdf_raises_value_error(self, tmp_path: Path):
        """A file whose bytes are not a PDF raises ValueError."""
        corrupt = tmp_path / "bad.pdf"
        corrupt.write_bytes(b"not a pdf")
        target = str(tmp_path / "out.pdf")
        with pytest.raises(ValueError, match="Invalid PDF"):
            extract_page_as_pdf(str(corrupt), 1, target)
# ---------------------------------------------------------------------------
# extract_pages_as_pdf
# ---------------------------------------------------------------------------
class TestExtractPagesAsPdf:
    """Tests for extract_pages_as_pdf()."""

    def test_multiple_pages_creates_correct_files(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """Extracting pages [1, 3] from a 3-page PDF creates 2 single-page files."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(multipage_pdf, [1, 3], out_dir, "doc")
        assert len(result) == 2
        assert _page_count(str(Path(out_dir) / result[0])) == 1
        assert _page_count(str(Path(out_dir) / result[1])) == 1

    def test_naming_convention(self, tmp_path: Path, multipage_pdf: str):
        """Output files follow {stem}_page_{n}.pdf naming."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(
            multipage_pdf, [1, 3], out_dir, "NEC4 ACC"
        )
        assert "NEC4 ACC_page_1.pdf" in result
        assert "NEC4 ACC_page_3.pdf" in result

    def test_empty_page_numbers_returns_empty(self, tmp_path: Path, multipage_pdf: str):
        """Empty page_numbers list returns empty list and creates no files."""
        out_dir = Path(tmp_path / "pages")
        result = extract_pages_as_pdf(multipage_pdf, [], str(out_dir), "doc")
        assert result == []
        # The output dir may or may not be created for empty input (both are
        # fine), but if it exists it must not contain any extracted files.
        assert not out_dir.exists() or not any(out_dir.iterdir())

    def test_creates_output_dir_if_missing(self, tmp_path: Path, multipage_pdf: str):
        """Function creates output_dir when it does not yet exist."""
        out_dir = str(tmp_path / "nested" / "deep" / "dir")
        assert not Path(out_dir).exists()
        result = extract_pages_as_pdf(multipage_pdf, [2], out_dir, "test")
        assert Path(out_dir).is_dir()
        assert len(result) == 1

    def test_returns_relative_filenames(self, tmp_path: Path, multipage_pdf: str):
        """Returned paths are just filenames, not full paths."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(multipage_pdf, [1, 2], out_dir, "mydoc")
        for name in result:
            assert Path(name).name == name  # no directory components

    def test_out_of_range_page_raises_value_error(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """Any page number out of range raises ValueError."""
        out_dir = str(tmp_path / "pages")
        with pytest.raises(ValueError, match="out of range"):
            extract_pages_as_pdf(multipage_pdf, [1, 50], out_dir, "doc")

    def test_nonexistent_source_raises_file_not_found(self, tmp_path: Path):
        """Missing source file raises FileNotFoundError."""
        out_dir = str(tmp_path / "pages")
        with pytest.raises(FileNotFoundError):
            extract_pages_as_pdf("/no/such/file.pdf", [1], out_dir, "doc")

    def test_invalid_pdf_raises_value_error(self, tmp_path: Path):
        """Corrupt source raises ValueError."""
        bad = tmp_path / "bad.pdf"
        bad.write_bytes(b"garbage")
        out_dir = str(tmp_path / "pages")
        with pytest.raises(ValueError, match="Invalid PDF"):
            extract_pages_as_pdf(str(bad), [1], out_dir, "doc")

    def test_extracted_content_correct(self, tmp_path: Path, multipage_pdf: str):
        """Each extracted file contains the right page text."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(
            multipage_pdf, [1, 3], out_dir, "content_test"
        )
        p1 = str(Path(out_dir) / result[0])
        p3 = str(Path(out_dir) / result[1])
        assert "Alpha" in _page_text(p1, 0)
        assert "Charlie" in _page_text(p3, 0)

View File

@ -0,0 +1,101 @@
import os
from typing import List
from pypdf import PdfReader, PdfWriter
def extract_page_as_pdf(source_path: str, page_number: int, output_path: str) -> str:
    """Extract a single page from a PDF and save it as a new one-page PDF.

    Args:
        source_path: Path to original PDF file.
        page_number: 1-indexed page number to extract.
        output_path: Where to save the extracted page PDF. The parent
            directory is created if it does not exist.

    Returns:
        The output_path of the saved PDF file.

    Raises:
        FileNotFoundError: If source_path does not exist.
        ValueError: If source is not a valid PDF or page_number is out of range.
    """
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source file not found: {source_path}")

    try:
        reader = PdfReader(source_path)
        # Counting pages is kept inside the guard: pypdf may defer part of
        # its parsing until the page list is first accessed, and a failure
        # there should surface as the documented ValueError as well.
        total = len(reader.pages)
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    if not 1 <= page_number <= total:
        raise ValueError(
            f"Page number {page_number} out of range (1-{total})"
        )

    writer = PdfWriter()
    writer.add_page(reader.pages[page_number - 1])  # convert to 0-indexed

    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "wb") as f:
        writer.write(f)
    return output_path
def extract_pages_as_pdf(
    source_path: str,
    page_numbers: List[int],
    output_dir: str,
    filename_stem: str,
) -> List[str]:
    """Extract multiple pages from a PDF, saving each as a separate PDF.

    Naming convention: {filename_stem}_page_{page_number}.pdf

    All page numbers are validated before anything is written, so an
    out-of-range entry leaves no partial output behind.

    Args:
        source_path: Path to original PDF file.
        page_numbers: List of 1-indexed page numbers to extract.
        output_dir: Directory to save extracted PDFs (created if missing).
        filename_stem: Base name for output files (e.g. "NEC4 ACC").

    Returns:
        List of output file names (relative to output_dir), in the same
        order as page_numbers. Empty if page_numbers is empty.

    Raises:
        FileNotFoundError: If source_path does not exist.
        ValueError: If source is not a valid PDF or any page_number is out of range.
    """
    # Empty input short-circuits before the source file is even checked.
    if not page_numbers:
        return []

    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source file not found: {source_path}")

    try:
        reader = PdfReader(source_path)
        # Counting pages is kept inside the guard: pypdf may defer part of
        # its parsing until the page list is first accessed, and a failure
        # there should surface as the documented ValueError as well.
        total = len(reader.pages)
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    for pn in page_numbers:
        if not 1 <= pn <= total:
            raise ValueError(
                f"Page number {pn} out of range (1-{total})"
            )

    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): filename_stem is joined into the path unchanged; if it can
    # come from user input, confirm that callers strip path separators.
    output_filenames: List[str] = []
    for pn in page_numbers:
        filename = f"{filename_stem}_page_{pn}.pdf"
        full_path = os.path.join(output_dir, filename)

        # A fresh writer per page so every output file holds exactly one page.
        writer = PdfWriter()
        writer.add_page(reader.pages[pn - 1])  # convert to 0-indexed
        with open(full_path, "wb") as f:
            writer.write(f)
        output_filenames.append(filename)
    return output_filenames