from typing import List, Tuple

from pypdf import PdfReader


def parse_pdf(file_path: str) -> str:
    """Parse a PDF file and return its text content as a single string.

    Text is extracted page by page; each page's text is stripped of
    leading/trailing whitespace and the pages are joined with a single
    newline. Pages with no extractable text (empty or whitespace-only) are
    skipped, so they never introduce blank lines into the result.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page text, or an empty string when no page yields
        any text.

    Raises:
        ValueError: If the file cannot be opened as a PDF, or if text
            extraction fails on any page.
    """
    # Delegate to the per-page parser so both entry points share the same
    # page-skipping rule and the same error translation.
    pages = parse_pdf_by_page(file_path)
    # Every page text is already stripped and non-empty, so the joined
    # string needs no further trimming.
    return "\n".join(text for _, text in pages)


def parse_pdf_by_page(file_path: str) -> List[Tuple[int, str]]:
    """Parse a PDF and return per-page text with 1-indexed page numbers.

    Pages with no extractable text (empty or whitespace-only) are skipped;
    the page numbers of the remaining pages are left unchanged, so the
    returned numbers may have gaps.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of ``(page_number, page_text)`` tuples in document order,
        where ``page_text`` has leading/trailing whitespace removed.

    Raises:
        ValueError: If the file cannot be opened as a PDF (any failure while
            constructing the reader, including a missing file, is reported
            this way), or if text extraction fails on any page.
    """
    try:
        reader = PdfReader(file_path)
    except Exception as exc:
        # Translate every reader failure into the single exception type
        # callers are documented to handle; the cause is kept via chaining.
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    pages: List[Tuple[int, str]] = []
    try:
        # Accessing reader.pages and extracting text can both fail on
        # damaged files, so the whole loop sits inside the try block.
        for page_number, page in enumerate(reader.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            stripped = text.strip()
            if stripped:
                pages.append((page_number, stripped))
    except Exception as exc:
        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc

    return pages