feat: rewrite DOCX parser with table extraction

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-28 16:42:41 +08:00
parent 36fe1172a0
commit 136c25ae38
1 changed files with 35 additions and 5 deletions

View File

@ -6,12 +6,22 @@ try:
except Exception: except Exception:
Document = None Document = None
# WordprocessingML namespace
_WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"


def _extract_text_from_element(element, nsmap: dict) -> str:
    """Return the text of a paragraph or table cell element in document order.

    Walks every descendant of *element* and converts WordprocessingML
    content into plain text:

    * ``w:t`` contributes its text.
    * ``w:tab`` becomes a tab character.
    * ``w:br`` (text-wrapping breaks only) and ``w:cr`` become a newline.
    * Each paragraph (``w:p``) after the first one encountered is preceded
      by a newline, so the paragraphs of a table cell are not fused into a
      single word.

    Args:
        element: An ElementTree-compatible element (for example an lxml
            ``w:p`` or ``w:tc`` element) exposing ``iter()`` and ``get()``.
        nsmap: Unused. Tags are matched by their fully qualified
            ``{namespace}name`` form, so no prefix mapping is required; the
            parameter is kept so existing callers keep working.

    Returns:
        The extracted text, or an empty string when the element holds no
        text content.
    """
    text_tag = f"{{{_WML_NS}}}t"
    tab_tag = f"{{{_WML_NS}}}tab"
    break_tags = (f"{{{_WML_NS}}}br", f"{{{_WML_NS}}}cr")
    paragraph_tag = f"{{{_WML_NS}}}p"
    break_type_attr = f"{{{_WML_NS}}}type"

    pieces: list[str] = []
    seen_paragraph = False
    # iter() yields the element itself first, then descendants in document
    # order, so text, tabs and breaks come out in reading sequence.
    for node in element.iter():
        tag = node.tag
        if tag == text_tag:
            pieces.append(node.text or "")
        elif tag == tab_tag:
            pieces.append("\t")
        elif tag in break_tags:
            # Page and column breaks are layout-only; only a text-wrapping
            # break (the default when w:type is absent) is a line break.
            if node.get(break_type_attr) in (None, "textWrapping"):
                pieces.append("\n")
        elif tag == paragraph_tag:
            # Separate consecutive paragraphs; the first one needs no prefix.
            if seen_paragraph:
                pieces.append("\n")
            seen_paragraph = True
    return "".join(pieces)
def parse_docx(file_path: str) -> str: def parse_docx(file_path: str) -> str:
"""Parse a DOCX file and return its text content. """Parse a DOCX file and return its text content.
The function preserves paragraph breaks by inserting a newline between Extracts text from both paragraphs and tables in document order,
paragraphs. Empty documents yield an empty string. preserving the original reading sequence. Table rows are joined
with `` | `` separators; paragraphs and rows are separated by newlines.
Raises: Raises:
ValueError: If the file is not a valid DOCX document or cannot be read. ValueError: If the file is not a valid DOCX document or cannot be read.
@ -23,6 +33,26 @@ def parse_docx(file_path: str) -> str:
except Exception as exc: # pragma: no cover - surface invalid DOCX except Exception as exc: # pragma: no cover - surface invalid DOCX
raise ValueError(f"Invalid DOCX file: {exc}") from exc raise ValueError(f"Invalid DOCX file: {exc}") from exc
paragraphs = [para.text for para in doc.paragraphs if para.text is not None] parts: list[str] = []
# Join with newline to preserve paragraph breaks
return "\n".join(paragraphs).strip() for element in doc.element.body:
tag = element.tag
if tag == f"{{{_WML_NS}}}p":
# Paragraph: extract all run text
text = _extract_text_from_element(element, {})
parts.append(text)
elif tag == f"{{{_WML_NS}}}tbl":
# Table: extract rows and cells in order
nsmap = {"w": _WML_NS}
for row in element.findall("w:tr", nsmap):
row_texts: list[str] = []
for cell in row.findall("w:tc", nsmap):
cell_text = _extract_text_from_element(cell, nsmap).strip()
if cell_text:
row_texts.append(cell_text)
if row_texts:
parts.append(" | ".join(row_texts))
return "\n".join(parts).strip()