from typing import List, Tuple

from pypdf import PdfReader


def parse_pdf(file_path: str) -> str:
    """Parse a PDF file and return its text content.

    Each page's text is stripped of surrounding whitespace. Pages that yield
    no text, or only whitespace, are skipped so they do not introduce blank
    lines in the result. The remaining page texts are joined with a single
    newline.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page text, or an empty string when no page has any
        extractable text.

    Raises:
        ValueError: If the file cannot be opened as a PDF, or if text
            extraction fails on any page.
    """
    try:
        reader = PdfReader(file_path)
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    texts = []
    try:
        for page in reader.pages:
            # Strip before testing so whitespace-only pages are dropped,
            # matching the page filtering done by parse_pdf_by_page.
            text = (page.extract_text() or "").strip()
            if text:
                texts.append(text)
    except Exception as exc:
        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc

    # Every entry is already stripped and non-empty, so the joined string
    # needs no further trimming.
    return "\n".join(texts)


def parse_pdf_by_page(file_path: str) -> List[Tuple[int, str]]:
    """Extract the text of a PDF one page at a time.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A list of ``(page_number, page_text)`` tuples in document order.
        Page numbers are 1-indexed and refer to the page's position in the
        PDF, so numbering has gaps wherever a page was skipped. Page text is
        stripped of surrounding whitespace; pages with no text, or only
        whitespace, are omitted.

    Raises:
        ValueError: If the file cannot be opened as a PDF, or if text
            extraction fails on any page.
    """
    try:
        document = PdfReader(file_path)
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    extracted: List[Tuple[int, str]] = []
    try:
        for index, pdf_page in enumerate(document.pages):
            raw = pdf_page.extract_text()
            if not raw:
                continue
            cleaned = raw.strip()
            if cleaned:
                # enumerate() counts from 0; reported page numbers start at 1.
                extracted.append((index + 1, cleaned))
    except Exception as exc:
        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc

    return extracted