36 lines
1.1 KiB
Python
36 lines
1.1 KiB
Python
from __future__ import annotations
|
|
|
|
from typing import Optional
|
|
# Placeholder for the ``Document`` callable from the optional ``docx`` package.
# It stays None until _ensure_docx_imported() imports the package and rebinds it.
Document = None
|
|
|
|
|
|
def _ensure_docx_imported():
    """Populate the module-level ``Document`` from the ``docx`` package on demand.

    If ``Document`` is already set, the function returns without attempting
    an import.

    Raises:
        ValueError: If importing ``Document`` from ``docx`` fails for any
            reason; the original exception is chained as the cause.
    """
    global Document

    # Guard clause: nothing to do when the global is already bound.
    if Document is not None:
        return

    try:
        from docx import Document as docx_factory  # type: ignore
    except Exception as exc:  # pragma: no cover - missing optional dep
        raise ValueError("DOCX library is not installed") from exc
    else:
        Document = docx_factory
|
|
|
|
|
|
def parse_docx(file_path: str) -> str:
    """Parse a DOCX file and return its text content.

    Paragraph texts are joined with a single newline so paragraph breaks are
    preserved, and leading/trailing whitespace of the combined text is
    stripped. Empty documents yield an empty string.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The paragraph texts joined by newlines, stripped of surrounding
        whitespace.

    Raises:
        ValueError: If the DOCX library is not installed (message
            ``"DOCX library is not installed"``), or if the file is not a
            valid DOCX document or cannot be read (message prefixed with
            ``"Invalid DOCX file: "``).
    """
    # Resolve the optional dependency *outside* the try below, so a missing
    # library is reported as such instead of being relabelled as a bad file.
    _ensure_docx_imported()
    assert Document is not None  # narrows the lazily-bound global for type checkers

    try:
        doc = Document(file_path)
    except Exception as exc:  # pragma: no cover - surface invalid DOCX
        raise ValueError(f"Invalid DOCX file: {exc}") from exc

    # Read each paragraph's text once; skip any paragraph reporting None.
    texts = (para.text for para in doc.paragraphs)
    # Join with newline to preserve paragraph breaks
    return "\n".join(text for text in texts if text is not None).strip()
|