legco_ai_assistant/backend/app/utils/docx_parser.py

29 lines
901 B
Python

from __future__ import annotations
# The docx dependency is optional: when the import fails for any reason,
# ``Document`` is set to ``None`` so that importing this module still succeeds
# and parse_docx can report the missing library at call time instead.
try:
    from docx import Document as _Doc
except Exception:
    Document = None
else:
    Document = _Doc
def parse_docx(file_path: str) -> str:
    """Extract the text of a DOCX file as a single string.

    The text of every entry in the document's ``paragraphs`` collection is
    joined with a newline, so paragraph breaks survive in the output. Leading
    and trailing whitespace of the joined result is stripped, which means a
    document without any text produces an empty string.

    Args:
        file_path: Location of the DOCX file, passed straight to ``Document``.

    Returns:
        The newline-joined paragraph text with surrounding whitespace removed.

    Raises:
        ValueError: If the DOCX library could not be imported (``Document``
            is ``None``), or if ``Document`` fails to open ``file_path`` for
            any reason. In the latter case the original exception is chained
            as the cause.
    """
    # Guard clause: without the library there is nothing we can parse.
    if Document is None:
        raise ValueError("DOCX library is not installed")

    try:
        document = Document(file_path)
    except Exception as exc:  # pragma: no cover - surface invalid DOCX
        # Normalise every open/parse failure into the one documented error type.
        raise ValueError(f"Invalid DOCX file: {exc}") from exc

    # Read each paragraph's text once, drop any missing (None) entries, and
    # glue the rest together with newlines to keep paragraph boundaries.
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    joined = "\n".join(text for text in paragraph_texts if text is not None)
    return joined.strip()