feat: rewrite DOCX parser with table extraction

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-28 16:42:41 +08:00 · 2026-04-28 16:42:41 +08:00 · 136c25ae38
parent 36fe1172a0
commit 136c25ae38
1 changed files with 35 additions and 5 deletions
--- a/backend/app/utils/docx_parser.py
+++ b/backend/app/utils/docx_parser.py
@ -6,12 +6,22 @@ try:
 except Exception:
    Document = None
 # WordprocessingML namespace
 _WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
 def _extract_text_from_element(element, nsmap: dict) -> str:
    """Extract the readable text of a paragraph or table cell element.

    Walks ``element`` and its descendants in document order and collects
    run content:

    * ``w:t`` contributes its text;
    * ``w:tab`` becomes a tab character;
    * ``w:br`` and ``w:cr`` become a newline;
    * text belonging to separate ``w:p`` paragraphs (for example several
      paragraphs inside one table cell) is joined with a single space, so
      words from adjacent paragraphs do not run together. Paragraphs that
      produce no text are skipped.

    Args:
        element: XML element exposing ``iter()``, ``tag`` and ``text``.
        nsmap: Unused; kept so existing callers keep working. Tags are
            matched by their fully-qualified ``{namespace}name`` form, so
            no prefix mapping is needed.

    Returns:
        The collected text, or an empty string when the element contains
        no text-bearing content.
    """
    prefix = f"{{{_WML_NS}}}"
    paragraph_tag = f"{prefix}p"
    text_tag = f"{prefix}t"
    # Run-level elements that carry no text of their own but stand for
    # whitespace in the rendered document.
    whitespace_for_tag = {
        f"{prefix}tab": "\t",
        f"{prefix}br": "\n",
        f"{prefix}cr": "\n",
    }

    # One chunk of text pieces per paragraph encountered during the walk.
    chunks: list[list[str]] = [[]]
    for node in element.iter():
        tag = node.tag
        if tag == paragraph_tag:
            # Open a new chunk only if the current one already holds text;
            # this also covers ``element`` itself being the paragraph.
            if chunks[-1]:
                chunks.append([])
        elif tag == text_tag:
            if node.text:
                chunks[-1].append(node.text)
        else:
            filler = whitespace_for_tag.get(tag)
            if filler is not None:
                chunks[-1].append(filler)

    return " ".join("".join(chunk) for chunk in chunks if chunk)
 def parse_docx(file_path: str) -> str:
    """Parse a DOCX file and return its text content.
-    The function preserves paragraph breaks by inserting a newline between
+    Extracts text from both paragraphs and tables in document order,
-    paragraphs. Empty documents yield an empty string.
+    preserving the original reading sequence. Table rows are joined
    with `` | `` separators; paragraphs and rows are separated by newlines.
    Raises:
        ValueError: If the file is not a valid DOCX document or cannot be read.
@ -23,6 +33,26 @@ def parse_docx(file_path: str) -> str:
    except Exception as exc:  # pragma: no cover - surface invalid DOCX
        raise ValueError(f"Invalid DOCX file: {exc}") from exc
-    paragraphs = [para.text for para in doc.paragraphs if para.text is not None]
+    parts: list[str] = []
-    # Join with newline to preserve paragraph breaks
+
-    return "\n".join(paragraphs).strip()
+    for element in doc.element.body:
        tag = element.tag
        if tag == f"{{{_WML_NS}}}p":
            # Paragraph – extract all run text
            text = _extract_text_from_element(element, {})
            parts.append(text)
        elif tag == f"{{{_WML_NS}}}tbl":
            # Table – extract rows and cells in order
            nsmap = {"w": _WML_NS}
            for row in element.findall("w:tr", nsmap):
                row_texts: list[str] = []
                for cell in row.findall("w:tc", nsmap):
                    cell_text = _extract_text_from_element(cell, nsmap).strip()
                    if cell_text:
                        row_texts.append(cell_text)
                if row_texts:
                    parts.append(" | ".join(row_texts))
    return "\n".join(parts).strip()