54 lines
1.6 KiB
Python
54 lines
1.6 KiB
Python
from typing import List, Tuple
|
|
|
|
from pypdf import PdfReader
|
|
|
|
|
|
def parse_pdf(file_path: str) -> str:
    """Parse a PDF file and return its text content.

    Text is extracted page by page. Each page's text is stripped of
    surrounding whitespace and the non-empty results are joined with a
    single newline. Pages that yield no text, or only whitespace, are
    skipped so they do not leave blank lines in the output.

    Args:
        file_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        The concatenated page text, or an empty string when no page
        yields any text.

    Raises:
        ValueError: If ``PdfReader`` cannot open the file, or if text
            extraction fails on a page. The underlying exception is
            chained as the cause.
    """
    try:
        reader = PdfReader(file_path)
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    texts = []
    try:
        for page in reader.pages:
            # Strip before testing so whitespace-only pages are dropped
            # instead of contributing empty entries to the join below.
            text = (page.extract_text() or "").strip()
            if text:
                texts.append(text)
    except Exception as exc:
        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc

    # Every entry is already stripped and non-empty, so the joined string
    # has no leading or trailing whitespace.
    return "\n".join(texts)
|
|
|
|
|
|
def parse_pdf_by_page(file_path: str) -> List[Tuple[int, str]]:
    """Extract text from a PDF one page at a time.

    Each entry of the result pairs a 1-based page number with that page's
    text, stripped of surrounding whitespace. Pages that yield no text
    (or only whitespace) are left out, so the page numbers in the result
    may have gaps.

    Raises:
        ValueError: If the file cannot be opened as a PDF or text
            extraction fails.
    """
    try:
        document = PdfReader(file_path)
    except Exception as err:
        raise ValueError(f"Invalid PDF file: {err}") from err

    results: List[Tuple[int, str]] = []
    try:
        for index, current in enumerate(document.pages):
            raw = current.extract_text()
            cleaned = raw.strip() if raw else ""
            if not cleaned:
                # Nothing usable on this page; skip it but keep counting.
                continue
            results.append((index + 1, cleaned))
    except Exception as err:
        raise ValueError(f"Failed to extract text from PDF: {err}") from err

    return results
|