59 lines
2.0 KiB
Python
59 lines
2.0 KiB
Python
from __future__ import annotations
|
||
|
||
try:
|
||
from docx import Document as _Doc
|
||
Document = _Doc
|
||
except Exception:
|
||
Document = None
|
||
|
||
# WordprocessingML namespace
|
||
_WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||
|
||
|
||
def _extract_text_from_element(element, nsmap: dict) -> str:
    """Return the concatenated text of every ``w:t`` node beneath *element*.

    Args:
        element: XML element to search, e.g. a paragraph or a table cell.
        nsmap: Namespace prefix mapping forwarded to the element search.
            The search path itself spells out the WordprocessingML
            namespace in full rather than using a prefix.

    Returns:
        The text of all matching descendants, in document order, joined
        without any separator. Nodes that carry no text contribute nothing.
    """
    text_path = f".//{{{_WML_NS}}}t"
    text_nodes = element.findall(text_path, nsmap)
    # Skipping nodes whose text is None/empty is equivalent to joining "".
    return "".join(node.text for node in text_nodes if node.text)
|
||
|
||
|
||
def parse_docx(file_path: str) -> str:
    """Parse a DOCX file and return its text content.

    Walks the document body in order, so paragraphs and tables keep their
    original reading sequence. Each top-level paragraph becomes one output
    line (an empty paragraph yields an empty line). Each table row becomes
    one line whose non-empty cells are joined with `` | ``; rows without
    any text are skipped. Within a cell, the text of each direct child
    (separate paragraphs, nested tables) is joined with a single space so
    that words from adjacent paragraphs do not run together.

    Args:
        file_path: Location of the DOCX file to read.

    Returns:
        The extracted text, lines separated by newlines, with leading and
        trailing whitespace stripped.

    Raises:
        ValueError: If the DOCX library is not installed, or if the file is
            not a valid DOCX document or cannot be read.
    """
    if Document is None:
        raise ValueError("DOCX library is not installed")

    try:
        doc = Document(file_path)
    except Exception as exc:  # pragma: no cover - surface invalid DOCX
        raise ValueError(f"Invalid DOCX file: {exc}") from exc

    # Loop-invariant lookups, built once instead of per body element.
    nsmap = {"w": _WML_NS}
    paragraph_tag = f"{{{_WML_NS}}}p"
    table_tag = f"{{{_WML_NS}}}tbl"

    parts: list[str] = []

    for element in doc.element.body:
        tag = element.tag

        if tag == paragraph_tag:
            # Paragraph - collect the text of all of its runs.
            parts.append(_extract_text_from_element(element, nsmap))

        elif tag == table_tag:
            # Table - emit one line per row, cells in order.
            for row in element.findall("w:tr", nsmap):
                row_texts: list[str] = []
                for cell in row.findall("w:tc", nsmap):
                    # Extract each direct child of the cell on its own and
                    # join with a space; reading the whole cell in one go
                    # would fuse the last word of one paragraph with the
                    # first word of the next.
                    child_texts = (
                        _extract_text_from_element(child, nsmap).strip()
                        for child in cell.findall("*")
                    )
                    cell_text = " ".join(text for text in child_texts if text)
                    if cell_text:
                        row_texts.append(cell_text)
                if row_texts:
                    parts.append(" | ".join(row_texts))

    return "\n".join(parts).strip()
|