feat(backend): add PDF page extractor and chunk PDF storage config
New pdf_extractor.py with extract_page_as_pdf() and extract_pages_as_pdf() for extracting individual PDF pages as separate files. Adds document_chunk_path setting to config and document_chunk/ to .gitignore. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
20b2f2c267
commit
8c84062996
|
|
@ -45,6 +45,9 @@ Thumbs.db
|
||||||
# ChromaDB
|
# ChromaDB
|
||||||
chroma_db/
|
chroma_db/
|
||||||
|
|
||||||
|
# Chunk PDF storage
|
||||||
|
document_chunk/
|
||||||
|
|
||||||
# Backend logs
|
# Backend logs
|
||||||
backend/app/log/
|
backend/app/log/
|
||||||
*.log
|
*.log
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,9 @@ class Settings(BaseSettings):
|
||||||
# ChromaDB
|
# ChromaDB
|
||||||
chroma_db_path: str = "./chroma_db"
|
chroma_db_path: str = "./chroma_db"
|
||||||
|
|
||||||
|
# Chunk PDF storage (extracted PDF pages)
|
||||||
|
document_chunk_path: str = "./document_chunk"
|
||||||
|
|
||||||
# App configuration moved to settings for easier testing/configuration
|
# App configuration moved to settings for easier testing/configuration
|
||||||
# Cross-origin settings and chunking parameters (Phase 1 plan)
|
# Cross-origin settings and chunking parameters (Phase 1 plan)
|
||||||
cors_origins: list[str] = ["http://localhost:5173", "http://localhost:3000"]
|
cors_origins: list[str] = ["http://localhost:5173", "http://localhost:3000"]
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,197 @@
|
||||||
|
"""Phase 1.5.5a: PDF page extractor tests.
|
||||||
|
|
||||||
|
Tests for extract_page_as_pdf() and extract_pages_as_pdf() which extract
|
||||||
|
individual pages from a PDF and save them as separate single-page PDF files.
|
||||||
|
|
||||||
|
These extracted "chunk PDFs" serve as clickable source references in RAG responses.
|
||||||
|
"""
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
from app.utils.pdf_extractor import extract_page_as_pdf, extract_pages_as_pdf
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _create_multipage_pdf(path: Path, page_texts: list[str]) -> None:
    """Write a PDF to *path* containing one page per entry of *page_texts*.

    Each page carries its text as a single string drawn at (72, 720).
    """
    # Function-scope import: reportlab is only loaded when this helper runs.
    from reportlab.pdfgen import canvas

    buffer = BytesIO()
    pdf = canvas.Canvas(buffer)
    for page_text in page_texts:
        pdf.drawString(72, 720, page_text)
        pdf.showPage()  # close the current page; the next draw starts a new one
    pdf.save()

    pdf_bytes = buffer.getvalue()
    path.write_bytes(pdf_bytes)
|
||||||
|
|
||||||
|
|
||||||
|
def _page_count(pdf_path: str) -> int:
    """Open the PDF at *pdf_path* and report how many pages it has."""
    reader = PdfReader(pdf_path)
    return len(reader.pages)
|
||||||
|
|
||||||
|
|
||||||
|
def _page_text(pdf_path: str, page_index: int = 0) -> str:
    """Return the text of page *page_index* (0-based) of the PDF at *pdf_path*.

    An empty string is returned when the extractor yields nothing for the page.
    """
    page = PdfReader(pdf_path).pages[page_index]
    text = page.extract_text()
    return text if text else ""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fixture
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@pytest.fixture()
def multipage_pdf(tmp_path: Path) -> str:
    """Build a 3-page PDF (Alpha / Bravo / Charlie) in tmp_path.

    Returns the path of the generated ``source.pdf`` as a string.
    """
    source = tmp_path / "source.pdf"
    _create_multipage_pdf(source, ["Alpha", "Bravo", "Charlie"])
    return str(source)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# extract_page_as_pdf
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestExtractPageAsPdf:
    """Tests for extract_page_as_pdf()."""

    def test_valid_page_creates_single_page_pdf(self, tmp_path: Path, multipage_pdf: str):
        """Extracting page 2 returns output_path and writes a 1-page PDF."""
        out = str(tmp_path / "out.pdf")
        result = extract_page_as_pdf(multipage_pdf, 2, out)
        assert result == out
        assert _page_count(out) == 1

    def test_page_content_matches_source(self, tmp_path: Path, multipage_pdf: str):
        """Extracted page 1 contains 'Alpha', page 2 'Bravo', page 3 'Charlie'."""
        expected_texts = ["Alpha", "Bravo", "Charlie"]
        # Page numbers passed to the extractor are 1-indexed.
        for page_number, expected in enumerate(expected_texts, start=1):
            out = str(tmp_path / f"page{page_number}.pdf")
            extract_page_as_pdf(multipage_pdf, page_number, out)
            assert expected in _page_text(out, 0)

    def test_page_out_of_range_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
        """Page number beyond PDF length raises ValueError and writes nothing."""
        out = str(tmp_path / "never.pdf")
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, 99, out)
        assert not Path(out).exists()

    def test_page_zero_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
        """Page number 0 (pages are 1-indexed) raises ValueError and writes nothing."""
        out = str(tmp_path / "never.pdf")
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, 0, out)
        assert not Path(out).exists()

    def test_negative_page_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
        """Negative page number raises ValueError and writes nothing."""
        out = str(tmp_path / "never.pdf")
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, -1, out)
        assert not Path(out).exists()

    def test_nonexistent_source_raises_file_not_found(self, tmp_path: Path):
        """Missing source file raises FileNotFoundError and writes nothing."""
        out = str(tmp_path / "out.pdf")
        with pytest.raises(FileNotFoundError):
            extract_page_as_pdf("/no/such/file.pdf", 1, out)
        assert not Path(out).exists()

    def test_invalid_pdf_raises_value_error(self, tmp_path: Path):
        """Corrupt / non-PDF content raises ValueError and writes nothing."""
        bad = tmp_path / "bad.pdf"
        bad.write_bytes(b"not a pdf")
        out = str(tmp_path / "out.pdf")
        with pytest.raises(ValueError, match="Invalid PDF"):
            extract_page_as_pdf(str(bad), 1, out)
        assert not Path(out).exists()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# extract_pages_as_pdf
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestExtractPagesAsPdf:
    """Tests for extract_pages_as_pdf()."""

    def test_multiple_pages_creates_correct_files(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """Extracting pages [1, 3] from a 3-page PDF creates 2 single-page files."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(multipage_pdf, [1, 3], out_dir, "doc")
        assert len(result) == 2
        for name in result:
            assert _page_count(str(Path(out_dir) / name)) == 1

    def test_naming_convention(self, tmp_path: Path, multipage_pdf: str):
        """Output files follow {stem}_page_{n}.pdf naming, in request order."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(
            multipage_pdf, [1, 3], out_dir, "NEC4 ACC"
        )
        assert result == ["NEC4 ACC_page_1.pdf", "NEC4 ACC_page_3.pdf"]
        # The returned names must correspond to files actually written.
        for name in result:
            assert (Path(out_dir) / name).is_file()

    def test_empty_page_numbers_returns_empty(self, tmp_path: Path, multipage_pdf: str):
        """Empty page_numbers list returns empty list and creates no files."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(multipage_pdf, [], out_dir, "doc")
        assert result == []
        # output dir may or may not be created for empty input — both are fine

    def test_creates_output_dir_if_missing(self, tmp_path: Path, multipage_pdf: str):
        """Function creates output_dir when it does not yet exist."""
        out_dir = str(tmp_path / "nested" / "deep" / "dir")
        assert not Path(out_dir).exists()
        result = extract_pages_as_pdf(multipage_pdf, [2], out_dir, "test")
        assert Path(out_dir).is_dir()
        assert len(result) == 1

    def test_returns_relative_filenames(self, tmp_path: Path, multipage_pdf: str):
        """Returned paths are just filenames, not full paths."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(multipage_pdf, [1, 2], out_dir, "mydoc")
        for name in result:
            assert Path(name).name == name  # no directory components

    def test_out_of_range_page_raises_value_error(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """Any page number out of range raises ValueError before anything is written."""
        out_dir = str(tmp_path / "pages")
        with pytest.raises(ValueError, match="out of range"):
            extract_pages_as_pdf(multipage_pdf, [1, 50], out_dir, "doc")
        # Page 1 is valid, but the whole request must be rejected up front.
        assert not (Path(out_dir) / "doc_page_1.pdf").exists()

    def test_nonexistent_source_raises_file_not_found(self, tmp_path: Path):
        """Missing source file raises FileNotFoundError."""
        out_dir = str(tmp_path / "pages")
        with pytest.raises(FileNotFoundError):
            extract_pages_as_pdf("/no/such/file.pdf", [1], out_dir, "doc")
        assert not (Path(out_dir) / "doc_page_1.pdf").exists()

    def test_invalid_pdf_raises_value_error(self, tmp_path: Path):
        """Corrupt source raises ValueError."""
        bad = tmp_path / "bad.pdf"
        bad.write_bytes(b"garbage")
        out_dir = str(tmp_path / "pages")
        with pytest.raises(ValueError, match="Invalid PDF"):
            extract_pages_as_pdf(str(bad), [1], out_dir, "doc")
        assert not (Path(out_dir) / "doc_page_1.pdf").exists()

    def test_extracted_content_correct(self, tmp_path: Path, multipage_pdf: str):
        """Each extracted file contains the right page text."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(
            multipage_pdf, [1, 3], out_dir, "content_test"
        )
        p1 = str(Path(out_dir) / result[0])
        p3 = str(Path(out_dir) / result[1])
        assert "Alpha" in _page_text(p1, 0)
        assert "Charlie" in _page_text(p3, 0)
|
||||||
|
|
@ -0,0 +1,101 @@
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from pypdf import PdfReader, PdfWriter
|
||||||
|
|
||||||
|
|
||||||
|
def extract_page_as_pdf(source_path: str, page_number: int, output_path: str) -> str:
    """Extract a single page from a PDF and save it as a new one-page PDF.

    The source is validated completely (existence, readability, page range)
    before anything is written, so a failed call leaves no output file.

    Args:
        source_path: Path to original PDF file.
        page_number: 1-indexed page number to extract.
        output_path: Where to save the extracted page PDF. Missing parent
            directories are created.

    Returns:
        The output_path of the saved PDF file.

    Raises:
        FileNotFoundError: If source_path does not exist.
        ValueError: If source is not a valid PDF or page_number is out of range.
    """
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source file not found: {source_path}")

    try:
        reader = PdfReader(source_path)
        # Count pages inside the try: accessing the page list can itself fail
        # on damaged or still-encrypted files, and callers are promised a
        # ValueError for any unreadable source.
        total = len(reader.pages)
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    if not 1 <= page_number <= total:
        raise ValueError(
            f"Page number {page_number} out of range (1–{total})"
        )

    writer = PdfWriter()
    writer.add_page(reader.pages[page_number - 1])  # convert to 0-based index

    # dirname() is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "wb") as f:
        writer.write(f)

    return output_path
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pages_as_pdf(
    source_path: str,
    page_numbers: List[int],
    output_dir: str,
    filename_stem: str,
) -> List[str]:
    """Extract multiple pages from a PDF, saving each as a separate PDF.

    Naming convention: {filename_stem}_page_{page_number}.pdf

    All page numbers are validated before the output directory is created or
    any file is written, so a rejected request produces no output.

    Args:
        source_path: Path to original PDF file.
        page_numbers: List of 1-indexed page numbers to extract. An empty
            list short-circuits to an empty result without touching the
            source or the output directory.
        output_dir: Directory to save extracted PDFs (created if missing).
        filename_stem: Base name for output files (e.g. "NEC4 ACC").

    Returns:
        List of output file names (relative to output_dir), in the same
        order as page_numbers.

    Raises:
        FileNotFoundError: If source_path does not exist.
        ValueError: If source is not a valid PDF or any page_number is out of range.
    """
    if not page_numbers:
        return []

    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source file not found: {source_path}")

    try:
        reader = PdfReader(source_path)
        # Count pages inside the try: accessing the page list can itself fail
        # on damaged or still-encrypted files, and callers are promised a
        # ValueError for any unreadable source.
        total = len(reader.pages)
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    # Reject the whole request up front rather than failing half-way through.
    for pn in page_numbers:
        if not 1 <= pn <= total:
            raise ValueError(
                f"Page number {pn} out of range (1–{total})"
            )

    os.makedirs(output_dir, exist_ok=True)

    output_filenames: List[str] = []
    for pn in page_numbers:
        # NOTE(review): filename_stem is interpolated as-is; a stem containing
        # path separators would place the file outside output_dir — confirm
        # callers always pass a bare name.
        filename = f"{filename_stem}_page_{pn}.pdf"

        # A fresh writer per page keeps every output a single-page document.
        writer = PdfWriter()
        writer.add_page(reader.pages[pn - 1])  # convert to 0-based index
        with open(os.path.join(output_dir, filename), "wb") as f:
            writer.write(f)

        output_filenames.append(filename)

    return output_filenames
|
||||||
Loading…
Reference in New Issue