legco_ai_assistant/backend/app/utils/docx_parser.py

59 lines
2.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from __future__ import annotations
# python-docx is an optional dependency.  ``Document`` stays ``None`` whenever
# the import fails for any reason; parse_docx() checks for that and raises a
# ValueError instead of failing at import time.
Document = None
try:
    from docx import Document as _Doc
except Exception:
    pass  # deliberate best-effort: leave Document as None
else:
    Document = _Doc

# Namespace URI of WordprocessingML (the ``w:`` prefix in document.xml).
_WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
def _extract_text_from_element(element, nsmap: dict) -> str:
    """Return the text of a paragraph or table-cell element.

    Runs (``w:r``) are visited in document order and each piece of run
    content is converted to its plain-text equivalent:

    * ``w:t``                 -> its text
    * ``w:tab`` / ``w:ptab``  -> ``"\\t"``
    * ``w:br`` (text-wrapping) and ``w:cr`` -> ``"\\n"``
    * ``w:noBreakHyphen``     -> ``"-"``

    Page and column breaks (``w:br`` with ``w:type`` of ``page``/``column``)
    produce no text.  When *element* contains several paragraphs (for
    example a table cell), consecutive paragraphs are separated by a
    newline so their words do not run together.

    Args:
        element: An ElementTree-compatible element (lxml or stdlib).
        nsmap: Prefix-to-namespace mapping.  Accepted for backward
            compatibility; tags are matched by their fully-qualified
            names, so the mapping is not consulted.

    Returns:
        The concatenated text; an empty string when there is none.
    """
    ns = f"{{{_WML_NS}}}"
    para_tag = ns + "p"
    run_tag = ns + "r"
    text_tag = ns + "t"
    break_tag = ns + "br"
    break_type_attr = ns + "type"
    literals = {
        ns + "tab": "\t",
        ns + "ptab": "\t",
        ns + "cr": "\n",
        ns + "noBreakHyphen": "-",
    }

    pieces: list[str] = []
    for node in element.iter():
        if node.tag == para_tag:
            # A new paragraph after some text has been collected: keep the
            # boundary visible instead of fusing the two paragraphs.
            if pieces:
                pieces.append("\n")
            continue
        if node.tag != run_tag:
            continue
        # Only direct children of a run are text content.  This keeps
        # tab-stop definitions (``w:pPr/w:tabs/w:tab``) from being mistaken
        # for tab characters.
        for child in node:
            tag = child.tag
            if tag == text_tag:
                pieces.append(child.text or "")
            elif tag == break_tag:
                if child.get(break_type_attr, "textWrapping") == "textWrapping":
                    pieces.append("\n")
            elif tag in literals:
                pieces.append(literals[tag])
    return "".join(pieces)
def _iter_unwrapped(parent, wanted):
    """Yield children of *parent* whose tag is in *wanted*, in document order.

    Content-control wrappers (``w:sdt`` / ``w:sdtContent``) and
    ``w:customXml`` wrappers are treated as transparent: matching elements
    inside them are yielded as if they were direct children of *parent*.
    """
    wrappers = {
        f"{{{_WML_NS}}}{name}" for name in ("sdt", "sdtContent", "customXml")
    }
    for child in parent:
        tag = child.tag
        if tag in wanted:
            yield child
        elif tag in wrappers:
            yield from _iter_unwrapped(child, wanted)


def parse_docx(file_path: str) -> str:
    """Parse a DOCX file and return its text content.

    Extracts text from both paragraphs and tables in document order,
    preserving the original reading sequence. Table rows are joined
    with `` | `` separators; paragraphs and rows are separated by newlines.
    Paragraphs and tables wrapped in content controls (``w:sdt``) or
    ``w:customXml`` elements are included as well.

    Empty paragraphs are kept as blank lines (leading and trailing
    whitespace of the final result is stripped).  Cells without text are
    omitted from their row, and rows without any text are omitted entirely.

    Args:
        file_path: Location of the DOCX file, passed to ``Document``.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the DOCX library is not installed, or if the file
            is not a valid DOCX document or cannot be read.
    """
    if Document is None:
        raise ValueError("DOCX library is not installed")
    try:
        doc = Document(file_path)
    except Exception as exc:  # pragma: no cover - surface invalid DOCX
        raise ValueError(f"Invalid DOCX file: {exc}") from exc

    # Loop-invariant lookups, built once.
    nsmap = {"w": _WML_NS}
    para_tag = f"{{{_WML_NS}}}p"
    table_tag = f"{{{_WML_NS}}}tbl"
    row_tag = f"{{{_WML_NS}}}tr"
    cell_tag = f"{{{_WML_NS}}}tc"

    parts: list[str] = []
    for block in _iter_unwrapped(doc.element.body, {para_tag, table_tag}):
        if block.tag == para_tag:
            # Paragraph: extract all run text (kept even when empty).
            parts.append(_extract_text_from_element(block, nsmap))
            continue

        # Table: extract rows and cells in order.
        for row in _iter_unwrapped(block, {row_tag}):
            cell_texts = (
                _extract_text_from_element(cell, nsmap).strip()
                for cell in _iter_unwrapped(row, {cell_tag})
            )
            row_texts = [text for text in cell_texts if text]
            if row_texts:
                parts.append(" | ".join(row_texts))
    return "\n".join(parts).strip()