from __future__ import annotations

from typing import Any, Callable, Optional

# Lazily-populated reference to ``docx.Document``. It stays ``None`` until
# ``_ensure_docx_imported`` succeeds, so importing this module never requires
# the optional DOCX dependency to be present.
Document: Optional[Callable[..., Any]] = None


def _ensure_docx_imported() -> None:
    """Import ``docx.Document`` on first use and cache it in ``Document``.

    Does nothing when the module-level ``Document`` is already set.

    Raises:
        ValueError: If importing ``Document`` from ``docx`` fails.
    """
    global Document
    if Document is not None:
        return
    try:
        from docx import Document as _Doc  # type: ignore
    except Exception as exc:  # pragma: no cover - missing optional dep
        # Deliberately broad: any failure while importing the optional
        # dependency is reported to callers as the same ValueError.
        raise ValueError("DOCX library is not installed") from exc
    Document = _Doc


def parse_docx(file_path: str) -> str:
    """Parse a DOCX file and return its text content.

    Paragraph breaks are preserved by joining paragraph texts with a newline.
    Leading and trailing whitespace of the joined result is stripped, so an
    empty document yields an empty string.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The document's paragraph texts joined by ``"\\n"``.

    Raises:
        ValueError: If the DOCX library is not installed, or if the file is
            not a valid DOCX document or cannot be read.
    """
    # Resolve the optional dependency *outside* the try block below so that a
    # missing library is reported as such instead of being re-wrapped as
    # "Invalid DOCX file".
    _ensure_docx_imported()

    open_document = Document
    if open_document is None:
        # Explicit guard instead of ``assert`` (asserts vanish under ``-O``).
        raise ValueError("DOCX library is not installed")

    try:
        doc = open_document(file_path)
    except Exception as exc:  # pragma: no cover - surface invalid DOCX
        raise ValueError(f"Invalid DOCX file: {exc}") from exc

    # Skip paragraphs whose text is None, then join with newlines to keep
    # paragraph breaks.
    paragraphs = [para.text for para in doc.paragraphs if para.text is not None]
    return "\n".join(paragraphs).strip()