legco_ai_assistant/backend/app/utils/pdf_extractor.py

102 lines
2.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
from typing import List
from pypdf import PdfReader, PdfWriter
def extract_page_as_pdf(source_path: str, page_number: int, output_path: str) -> str:
    """Extract a single page from a PDF and save it as a new PDF file.

    Args:
        source_path: Path to the original PDF file.
        page_number: 1-indexed page number to extract.
        output_path: Where to save the extracted page PDF. Missing parent
            directories are created.

    Returns:
        The output_path of the saved PDF file.

    Raises:
        FileNotFoundError: If source_path does not exist.
        ValueError: If source is not a valid PDF or page_number is out of range.
    """
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source file not found: {source_path}")

    try:
        reader = PdfReader(source_path)
    except Exception as exc:
        # Whatever went wrong while opening/parsing, callers are promised a
        # ValueError; the original exception stays attached as the cause.
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    total = len(reader.pages)
    if not 1 <= page_number <= total:
        # Explicit separator so the valid range reads "1-12", not "112".
        raise ValueError(
            f"Page number {page_number} out of range (1-{total})"
        )

    writer = PdfWriter()
    # Callers pass 1-indexed page numbers; reader.pages is 0-indexed.
    writer.add_page(reader.pages[page_number - 1])

    # dirname() is "" for a bare filename; fall back to the current directory
    # so makedirs() is not called with an empty path.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "wb") as f:
        writer.write(f)
    return output_path
def extract_pages_as_pdf(
    source_path: str,
    page_numbers: List[int],
    output_dir: str,
    filename_stem: str,
) -> List[str]:
    """Extract multiple pages from a PDF, saving each as a separate PDF.

    Naming convention: {filename_stem}_page_{page_number}.pdf

    All page numbers are validated before anything is written, so an
    out-of-range page number produces no output files.

    Args:
        source_path: Path to the original PDF file.
        page_numbers: List of 1-indexed page numbers to extract. An empty
            list returns immediately without checking source_path.
        output_dir: Directory to save extracted PDFs (created if missing).
        filename_stem: Base name for output files (e.g. "NEC4 ACC").

    Returns:
        List of output file names (relative to output_dir), in the same
        order as page_numbers.

    Raises:
        FileNotFoundError: If source_path does not exist.
        ValueError: If source is not a valid PDF or any page_number is out of range.
    """
    if not page_numbers:
        return []

    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Source file not found: {source_path}")

    try:
        reader = PdfReader(source_path)
    except Exception as exc:
        # Whatever went wrong while opening/parsing, callers are promised a
        # ValueError; the original exception stays attached as the cause.
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    total = len(reader.pages)
    for pn in page_numbers:
        if not 1 <= pn <= total:
            # Explicit separator so the valid range reads "1-12", not "112".
            raise ValueError(
                f"Page number {pn} out of range (1-{total})"
            )

    os.makedirs(output_dir, exist_ok=True)

    output_filenames: List[str] = []
    for pn in page_numbers:
        filename = f"{filename_stem}_page_{pn}.pdf"
        full_path = os.path.join(output_dir, filename)

        # A fresh writer per page keeps every output a single-page document.
        writer = PdfWriter()
        # Callers pass 1-indexed page numbers; reader.pages is 0-indexed.
        writer.add_page(reader.pages[pn - 1])
        with open(full_path, "wb") as f:
            writer.write(f)

        # Only the bare file name is reported, not the joined path.
        output_filenames.append(filename)
    return output_filenames