# legco_ai_assistant/backend/app/utils/docx_parser.py

from __future__ import annotations

# The DOCX library is treated as an optional dependency: if importing it fails
# for any reason, ``Document`` is left as ``None`` so this module can still be
# imported. ``parse_docx`` checks for ``None`` and raises ``ValueError``.
try:
    from docx import Document as _Doc
    Document = _Doc
except Exception:
    # Broad on purpose: any import-time failure is mapped to "not available".
    Document = None

# WordprocessingML namespace
_WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"


def _extract_text_from_element(element, nsmap: dict) -> str:
    """Extract the text content of a paragraph or table cell element.

    The text of every descendant ``w:t`` element is collected in document
    order. Text belonging to the same paragraph is concatenated as-is. Text
    coming from different ``w:p`` elements (for example a table cell that
    holds several paragraphs, or a nested table) is separated by a single
    space so that words are not fused across paragraph boundaries.

    Only ``w:t`` text is collected; other run content (tabs, line breaks,
    drawings, ...) contributes nothing to the result.

    Args:
        element: An XML element exposing ``.tag``, ``.text`` and iteration
            over its children (lxml or ``xml.etree.ElementTree`` style).
        nsmap: Prefix-to-namespace mapping. Kept for backward compatibility
            with existing callers; tags are matched by their fully
            qualified name, so the mapping is not consulted.

    Returns:
        The extracted text, or an empty string if the element has no text.
    """
    paragraph_tag = f"{{{_WML_NS}}}p"
    text_tag = f"{{{_WML_NS}}}t"

    chunks: list[str] = []
    # True while the next text to be emitted starts a different paragraph
    # than the previously emitted text. The separator is added lazily so no
    # leading or trailing separator is ever produced.
    boundary_pending = False

    def _walk(node) -> None:
        nonlocal boundary_pending
        for child in node:
            tag = child.tag
            if tag == text_tag:
                if child.text:
                    if boundary_pending and chunks:
                        chunks.append(" ")
                    boundary_pending = False
                    chunks.append(child.text)
            elif tag == paragraph_tag:
                # Mark a boundary both when entering and when leaving the
                # paragraph, so text that follows a nested paragraph is
                # separated from it as well.
                boundary_pending = True
                _walk(child)
                boundary_pending = True
            else:
                _walk(child)

    _walk(element)
    return "".join(chunks)


def parse_docx(file_path: str) -> str:
    """Return the text of a DOCX file with paragraphs and tables interleaved.

    The document body is walked from top to bottom so the output follows the
    original reading order. Every body-level paragraph becomes one output
    line (an empty paragraph yields an empty line). Every table row becomes
    one line in which the non-empty, stripped cell texts are joined with
    `` | ``; rows without any text are omitted. Other body-level elements are
    ignored. The final result has leading and trailing whitespace stripped.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The extracted text, one paragraph or table row per line.

    Raises:
        ValueError: If the DOCX library is not available, or if the file is
            not a valid DOCX document or cannot be read.
    """
    if Document is None:
        raise ValueError("DOCX library is not installed")

    try:
        document = Document(file_path)
    except Exception as exc:  # pragma: no cover - surface invalid DOCX
        raise ValueError(f"Invalid DOCX file: {exc}") from exc

    paragraph_tag = f"{{{_WML_NS}}}p"
    table_tag = f"{{{_WML_NS}}}tbl"
    w_ns = {"w": _WML_NS}
    lines: list[str] = []

    for child in document.element.body:
        if child.tag == paragraph_tag:
            # Paragraph: one output line, kept even when empty.
            lines.append(_extract_text_from_element(child, {}))
            continue
        if child.tag != table_tag:
            continue

        # Table: one output line per row that contains any text.
        for table_row in child.findall("w:tr", w_ns):
            stripped_cells = (
                _extract_text_from_element(table_cell, w_ns).strip()
                for table_cell in table_row.findall("w:tc", w_ns)
            )
            filled_cells = [cell_text for cell_text in stripped_cells if cell_text]
            if filled_cells:
                lines.append(" | ".join(filled_cells))

    return "\n".join(lines).strip()