legco_ai_assistant/backend/app/utils/pdf_parser.py

26 lines
779 B
Python

from pypdf import PdfReader
def parse_pdf(file_path: str) -> str:
    """Parse a PDF file and return its text content.

    Text is extracted page by page. Each page's text has its line endings
    normalized to LF and its surrounding whitespace trimmed. Pages that
    yield no text, or only whitespace, are skipped; the remaining pages are
    joined with a single newline.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page text, or an empty string when no page yields
        any text.

    Raises:
        ValueError: If constructing the reader for ``file_path`` fails, or
            if iterating the pages / extracting their text fails. The
            original exception is chained as ``__cause__``.
    """
    # Any failure while opening/parsing is reported to callers as ValueError,
    # so they only need to handle a single exception type.
    try:
        reader = PdfReader(file_path)
    except Exception as exc:
        raise ValueError(f"Invalid PDF file: {exc}") from exc

    texts = []
    try:
        for page in reader.pages:
            text = page.extract_text()
            if not text:
                continue
            # Normalize CRLF / bare CR to LF, then trim the page's edges.
            cleaned = text.replace("\r\n", "\n").replace("\r", "\n").strip()
            # Test emptiness AFTER stripping: a whitespace-only page must not
            # contribute an empty entry (it would show up as a blank line
            # between the neighbouring pages in the joined output).
            if cleaned:
                texts.append(cleaned)
    except Exception as exc:
        raise ValueError(f"Failed to extract text from PDF: {exc}") from exc

    # Every entry is already stripped and non-empty, so the joined result has
    # no leading or trailing whitespace.
    return "\n".join(texts)