diff --git a/.gitignore b/.gitignore index 552f6ed..5711281 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,9 @@ Thumbs.db # ChromaDB chroma_db/ +# Chunk PDF storage +document_chunk/ + # Backend logs backend/app/log/ *.log diff --git a/backend/app/core/config.py b/backend/app/core/config.py index a4e7019..e4fc90a 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -19,6 +19,9 @@ class Settings(BaseSettings): # ChromaDB chroma_db_path: str = "./chroma_db" + # Chunk PDF storage (extracted PDF pages) + document_chunk_path: str = "./document_chunk" + # App configuration moved to settings for easier testing/configuration # Cross-origin settings and chunking parameters (Phase 1 plan) cors_origins: list[str] = ["http://localhost:5173", "http://localhost:3000"] diff --git a/backend/app/test/test_phase1_pdf_extractor.py b/backend/app/test/test_phase1_pdf_extractor.py new file mode 100644 index 0000000..f2163cb --- /dev/null +++ b/backend/app/test/test_phase1_pdf_extractor.py @@ -0,0 +1,197 @@ +"""Phase 1.5.5a: PDF page extractor tests. + +Tests for extract_page_as_pdf() and extract_pages_as_pdf() which extract +individual pages from a PDF and save them as separate single-page PDF files. + +These extracted "chunk PDFs" serve as clickable source references in RAG responses. 
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _create_multipage_pdf(path: Path, page_texts: list[str]) -> None:
    """Create a multi-page PDF at *path* with one line of text per page.

    Args:
        path: Destination file; it is overwritten with the generated PDF bytes.
        page_texts: Text to draw on each page, in page order.
    """
    from reportlab.pdfgen import canvas

    buf = BytesIO()
    c = canvas.Canvas(buf)
    for text in page_texts:
        c.drawString(72, 720, text)
        # showPage() closes the current page so the next text lands on a new one.
        c.showPage()
    c.save()
    path.write_bytes(buf.getvalue())


def _page_count(pdf_path: str) -> int:
    """Return the number of pages in a PDF file."""
    return len(PdfReader(pdf_path).pages)


def _page_text(pdf_path: str, page_index: int = 0) -> str:
    """Return extracted text from a single page (0-indexed), or "" if none."""
    reader = PdfReader(pdf_path)
    return reader.pages[page_index].extract_text() or ""


# ---------------------------------------------------------------------------
# Fixture
# ---------------------------------------------------------------------------

@pytest.fixture()
def multipage_pdf(tmp_path: Path) -> str:
    """3-page PDF ("Alpha", "Bravo", "Charlie") saved to tmp; returns its string path."""
    pdf_path = tmp_path / "source.pdf"
    _create_multipage_pdf(pdf_path, ["Alpha", "Bravo", "Charlie"])
    return str(pdf_path)


# ---------------------------------------------------------------------------
# extract_page_as_pdf
# ---------------------------------------------------------------------------

class TestExtractPageAsPdf:
    """Tests for extract_page_as_pdf()."""

    def test_valid_page_creates_single_page_pdf(self, tmp_path: Path, multipage_pdf: str):
        """Extracting page 2 produces a PDF with exactly 1 page."""
        out = str(tmp_path / "out.pdf")
        result = extract_page_as_pdf(multipage_pdf, 2, out)
        assert result == out
        assert _page_count(out) == 1

    def test_page_content_matches_source(self, tmp_path: Path, multipage_pdf: str):
        """Extracted page 1 contains 'Alpha', page 2 contains 'Bravo', etc."""
        expected_texts = ["Alpha", "Bravo", "Charlie"]
        # Page numbers passed to the extractor are 1-indexed, hence start=1.
        for page_number, expected in enumerate(expected_texts, start=1):
            out = str(tmp_path / f"page{page_number}.pdf")
            extract_page_as_pdf(multipage_pdf, page_number, out)
            assert expected in _page_text(out, 0)

    def test_page_out_of_range_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
        """Page number beyond PDF length raises ValueError and writes nothing."""
        out = str(tmp_path / "never.pdf")
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, 99, out)
        assert not Path(out).exists()

    def test_page_zero_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
        """Page number 0 (pages are 1-indexed) raises ValueError and writes nothing."""
        out = str(tmp_path / "never.pdf")
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, 0, out)
        assert not Path(out).exists()

    def test_negative_page_raises_value_error(self, tmp_path: Path, multipage_pdf: str):
        """Negative page number raises ValueError and writes nothing."""
        out = str(tmp_path / "never.pdf")
        with pytest.raises(ValueError, match="out of range"):
            extract_page_as_pdf(multipage_pdf, -1, out)
        assert not Path(out).exists()

    def test_nonexistent_source_raises_file_not_found(self, tmp_path: Path):
        """Missing source file raises FileNotFoundError and writes nothing."""
        out = str(tmp_path / "out.pdf")
        with pytest.raises(FileNotFoundError):
            extract_page_as_pdf("/no/such/file.pdf", 1, out)
        assert not Path(out).exists()

    def test_invalid_pdf_raises_value_error(self, tmp_path: Path):
        """Corrupt / non-PDF content raises ValueError and writes nothing."""
        bad = tmp_path / "bad.pdf"
        bad.write_bytes(b"not a pdf")
        out = str(tmp_path / "out.pdf")
        with pytest.raises(ValueError, match="Invalid PDF"):
            extract_page_as_pdf(str(bad), 1, out)
        assert not Path(out).exists()
# ---------------------------------------------------------------------------
# extract_pages_as_pdf
# ---------------------------------------------------------------------------

class TestExtractPagesAsPdf:
    """Tests for extract_pages_as_pdf()."""

    def test_multiple_pages_creates_correct_files(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """Extracting pages [1, 3] from a 3-page PDF creates 2 single-page files."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(multipage_pdf, [1, 3], out_dir, "doc")
        assert len(result) == 2
        for name in result:
            assert _page_count(str(Path(out_dir) / name)) == 1

    def test_naming_convention(self, tmp_path: Path, multipage_pdf: str):
        """Output files follow {stem}_page_{n}.pdf naming."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(
            multipage_pdf, [1, 3], out_dir, "NEC4 ACC"
        )
        assert "NEC4 ACC_page_1.pdf" in result
        assert "NEC4 ACC_page_3.pdf" in result

    def test_empty_page_numbers_returns_empty(self, tmp_path: Path, multipage_pdf: str):
        """Empty page_numbers list returns empty list and creates no files."""
        out_dir = Path(tmp_path / "pages")
        result = extract_pages_as_pdf(multipage_pdf, [], str(out_dir), "doc")
        assert result == []
        # The output dir may or may not be created for empty input — both are
        # fine — but if it exists it must not contain anything.
        assert not out_dir.exists() or not any(out_dir.iterdir())

    def test_creates_output_dir_if_missing(self, tmp_path: Path, multipage_pdf: str):
        """Function creates output_dir when it does not yet exist."""
        out_dir = str(tmp_path / "nested" / "deep" / "dir")
        assert not Path(out_dir).exists()
        result = extract_pages_as_pdf(multipage_pdf, [2], out_dir, "test")
        assert Path(out_dir).is_dir()
        assert len(result) == 1

    def test_returns_relative_filenames(self, tmp_path: Path, multipage_pdf: str):
        """Returned paths are just filenames, not full paths."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(multipage_pdf, [1, 2], out_dir, "mydoc")
        for name in result:
            assert Path(name).name == name  # no directory components

    def test_out_of_range_page_raises_value_error(
        self, tmp_path: Path, multipage_pdf: str
    ):
        """Any page number out of range raises ValueError before anything is written."""
        out_dir = str(tmp_path / "pages")
        with pytest.raises(ValueError, match="out of range"):
            extract_pages_as_pdf(multipage_pdf, [1, 50], out_dir, "doc")
        # Page 1 is valid, but it must not be written when another requested
        # page is rejected.
        assert not (Path(out_dir) / "doc_page_1.pdf").exists()

    def test_nonexistent_source_raises_file_not_found(self, tmp_path: Path):
        """Missing source file raises FileNotFoundError."""
        out_dir = str(tmp_path / "pages")
        with pytest.raises(FileNotFoundError):
            extract_pages_as_pdf("/no/such/file.pdf", [1], out_dir, "doc")

    def test_invalid_pdf_raises_value_error(self, tmp_path: Path):
        """Corrupt source raises ValueError."""
        bad = tmp_path / "bad.pdf"
        bad.write_bytes(b"garbage")
        out_dir = str(tmp_path / "pages")
        with pytest.raises(ValueError, match="Invalid PDF"):
            extract_pages_as_pdf(str(bad), [1], out_dir, "doc")

    def test_extracted_content_correct(self, tmp_path: Path, multipage_pdf: str):
        """Each extracted file contains the right page text."""
        out_dir = str(tmp_path / "pages")
        result = extract_pages_as_pdf(
            multipage_pdf, [1, 3], out_dir, "content_test"
        )
        # Results come back in the order the page numbers were requested.
        for name, expected in zip(result, ["Alpha", "Charlie"]):
            assert expected in _page_text(str(Path(out_dir) / name), 0)
def _open_reader(source_path: str) -> "PdfReader":
    """Open *source_path* with pypdf, mapping failures onto the public contract.

    Raises:
        FileNotFoundError: If source_path does not exist.
        ValueError: If the file cannot be parsed or its pages cannot be counted.
    """
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source file not found: {source_path}")

    try:
        reader = PdfReader(source_path)
        # Count the pages inside the try so a document that opens but whose
        # pages cannot be enumerated is still reported as "Invalid PDF".
        # NOTE(review): pypdf appears to resolve the page tree lazily —
        # confirm against the pypdf version in use.
        len(reader.pages)
    except FileNotFoundError:
        # The file disappeared after the existence check; keep the documented
        # exception type instead of rewrapping it as ValueError.
        raise
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc
    return reader


def _require_page_in_range(page_number: int, total: int) -> None:
    """Raise ValueError unless 1 <= page_number <= total (pages are 1-indexed)."""
    if not 1 <= page_number <= total:
        raise ValueError(
            f"Page number {page_number} out of range (1–{total})"
        )


def _single_page_writer(reader: "PdfReader", page_number: int) -> "PdfWriter":
    """Return a PdfWriter holding only the given 1-indexed page of *reader*."""
    writer = PdfWriter()
    writer.add_page(reader.pages[page_number - 1])
    return writer


def extract_page_as_pdf(source_path: str, page_number: int, output_path: str) -> str:
    """Extract a single page from a PDF and save as a new PDF file.

    The parent directory of output_path is created if it does not exist.
    Nothing is written when validation fails.

    Args:
        source_path: Path to original PDF file.
        page_number: 1-indexed page number to extract.
        output_path: Where to save the extracted page PDF.

    Returns:
        The output_path of the saved PDF file.

    Raises:
        FileNotFoundError: If source_path does not exist.
        ValueError: If source is not a valid PDF or page_number is out of range.
    """
    reader = _open_reader(source_path)
    _require_page_in_range(page_number, len(reader.pages))

    writer = _single_page_writer(reader, page_number)

    # dirname() is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "wb") as f:
        writer.write(f)

    return output_path


def extract_pages_as_pdf(
    source_path: str,
    page_numbers: List[int],
    output_dir: str,
    filename_stem: str,
) -> List[str]:
    """Extract multiple pages from a PDF, saving each as a separate PDF.

    Naming convention: {filename_stem}_page_{page_number}.pdf

    All page numbers are validated before the first file is written, so an
    invalid entry leaves output_dir untouched. An empty page_numbers list
    returns immediately, before source_path is checked.

    Args:
        source_path: Path to original PDF file.
        page_numbers: List of 1-indexed page numbers to extract.
        output_dir: Directory to save extracted PDFs (created if missing).
        filename_stem: Base name for output files (e.g. "NEC4 ACC").
            NOTE(review): used verbatim in the file name; if it can come from
            untrusted input, path separators should be rejected by the caller.

    Returns:
        List of output file names (relative to output_dir), in request order.

    Raises:
        FileNotFoundError: If source_path does not exist.
        ValueError: If source is not a valid PDF or any page_number is out of range.
    """
    if not page_numbers:
        return []

    reader = _open_reader(source_path)
    total = len(reader.pages)
    for pn in page_numbers:
        _require_page_in_range(pn, total)

    os.makedirs(output_dir, exist_ok=True)

    output_filenames: List[str] = []
    for pn in page_numbers:
        filename = f"{filename_stem}_page_{pn}.pdf"
        writer = _single_page_writer(reader, pn)
        with open(os.path.join(output_dir, filename), "wb") as f:
            writer.write(f)
        output_filenames.append(filename)

    return output_filenames