from __future__ import annotations

try:
    from docx import Document as _Doc

    Document = _Doc
except Exception:
    # python-docx is treated as optional: any failure while importing it
    # leaves a ``None`` sentinel that parse_docx() reports as a ValueError.
    Document = None

# WordprocessingML namespace
_WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

# Fully qualified (Clark notation) tags, built once rather than per element.
_P_TAG = f"{{{_WML_NS}}}p"
_T_TAG = f"{{{_WML_NS}}}t"
_TBL_TAG = f"{{{_WML_NS}}}tbl"
_TR_TAG = f"{{{_WML_NS}}}tr"
_TC_TAG = f"{{{_WML_NS}}}tc"


def _extract_text_from_element(element, nsmap: dict) -> str:
    """Concatenate the text of every ``w:t`` descendant of *element*.

    Args:
        element: An XML element exposing ``findall`` (e.g. a paragraph).
        nsmap: Prefix-to-namespace mapping forwarded to ``findall``. The
            search path is already fully qualified, so an empty dict works.

    Returns:
        The text runs joined in document order with no separator; a run
        without text contributes an empty string.
    """
    runs = element.findall(f".//{_T_TAG}", nsmap)
    return "".join(run.text or "" for run in runs)


def _extract_cell_text(cell) -> str:
    """Return the text of a table cell with its paragraphs space-separated.

    A cell may hold several paragraphs (and nested tables). Joining every
    text run directly would glue the last word of one paragraph to the
    first word of the next, so a new chunk is started at each paragraph.

    Args:
        cell: A ``w:tc`` element exposing ``iter``.

    Returns:
        The stripped, non-empty paragraph texts joined by a single space,
        or ``""`` when the cell holds no text.
    """
    chunks: list[list[str]] = [[]]
    # A single document-order walk visits each text run exactly once, even
    # when paragraphs are nested inside other content of the cell.
    for node in cell.iter():
        tag = node.tag
        if tag == _P_TAG:
            chunks.append([])
        elif tag == _T_TAG:
            chunks[-1].append(node.text or "")
    paragraphs = ("".join(chunk).strip() for chunk in chunks)
    return " ".join(text for text in paragraphs if text)


def parse_docx(file_path: str) -> str:
    """Parse a DOCX file and return its text content.

    Extracts text from both paragraphs and tables in document order,
    preserving the original reading sequence. Table rows are joined with
    `` | `` separators; paragraphs and rows are separated by newlines.
    Cells without text are skipped, as are rows left with no cells, and
    multiple paragraphs inside one cell are separated by a single space.

    Args:
        file_path: Location of the DOCX file, handed to ``Document``.

    Returns:
        The extracted text with leading and trailing whitespace removed.

    Raises:
        ValueError: If the DOCX library is not installed, or if the file is
            not a valid DOCX document or cannot be read.
    """
    if Document is None:
        raise ValueError("DOCX library is not installed")
    try:
        doc = Document(file_path)
    except Exception as exc:  # pragma: no cover - surface invalid DOCX
        raise ValueError(f"Invalid DOCX file: {exc}") from exc

    parts: list[str] = []
    # Walk the body's direct children so paragraphs and tables keep the
    # order in which they appear in the document.
    for element in doc.element.body:
        tag = element.tag
        if tag == _P_TAG:
            # Paragraph - extract all run text (empty paragraphs are kept
            # as blank lines).
            parts.append(_extract_text_from_element(element, {}))
        elif tag == _TBL_TAG:
            # Table - extract rows and cells in order
            for row in element.findall(_TR_TAG):
                cell_texts = (
                    _extract_cell_text(cell) for cell in row.findall(_TC_TAG)
                )
                row_texts = [text for text in cell_texts if text]
                if row_texts:
                    parts.append(" | ".join(row_texts))
    return "\n".join(parts).strip()