feat: rewrite DOCX parser with table extraction
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
36fe1172a0
commit
136c25ae38
|
|
@ -6,12 +6,22 @@ try:
|
||||||
except Exception:
|
except Exception:
|
||||||
Document = None
|
Document = None
|
||||||
|
|
||||||
|
# WordprocessingML namespace
_WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"


def _extract_text_from_element(element, nsmap: dict) -> str:
    """Extract all text content from a paragraph or table cell element.

    Walks the descendants of *element* in document order and collects:

    * the text of every ``w:t`` node;
    * ``"\\t"`` for each ``w:tab`` that is a direct child of a run
      (tab-stop *definitions* under ``w:pPr/w:tabs`` share the tag name and
      are deliberately ignored);
    * ``"\\n"`` for each ``w:cr`` and each text-wrapping ``w:br`` that is a
      direct child of a run (page and column breaks contribute nothing).

    When *element* contains several paragraphs (for example a table cell),
    the non-empty paragraph texts are joined with a single space so that
    words from adjacent paragraphs are not fused together.

    Args:
        element: An ElementTree-compatible element (lxml or stdlib).
        nsmap: Unused. Tags are matched in Clark notation
            (``{namespace}tag``), so no prefix map is needed; the parameter
            is kept so existing callers keep working.

    Returns:
        The extracted text, or an empty string if there is none.
    """
    text_tag = f"{{{_WML_NS}}}t"
    tab_tag = f"{{{_WML_NS}}}tab"
    break_tag = f"{{{_WML_NS}}}br"
    cr_tag = f"{{{_WML_NS}}}cr"
    run_tag = f"{{{_WML_NS}}}r"
    para_tag = f"{{{_WML_NS}}}p"
    break_type_attr = f"{{{_WML_NS}}}type"

    # One list of fragments per paragraph; the first list collects anything
    # that appears before the first nested paragraph.
    pieces: list[list[str]] = [[]]

    # Explicit stack for a pre-order walk. Children are pushed reversed so
    # they are popped in document order. Each entry remembers whether its
    # parent is a run, because w:tab / w:br only represent characters there.
    pending = [
        (child, element.tag == run_tag) for child in reversed(list(element))
    ]
    while pending:
        node, parent_is_run = pending.pop()
        tag = node.tag

        if tag == text_tag:
            pieces[-1].append(node.text or "")
            continue

        if parent_is_run:
            if tag == tab_tag:
                pieces[-1].append("\t")
                continue
            if tag == cr_tag:
                pieces[-1].append("\n")
                continue
            if tag == break_tag:
                # A missing w:type means a text-wrapping (line) break.
                if node.get(break_type_attr, "textWrapping") == "textWrapping":
                    pieces[-1].append("\n")
                continue

        if tag == para_tag:
            # Start a new fragment list so paragraphs stay separated.
            pieces.append([])

        is_run = tag == run_tag
        pending.extend((child, is_run) for child in reversed(list(node)))

    paragraph_texts = ("".join(fragments) for fragments in pieces)
    return " ".join(text for text in paragraph_texts if text)
|
||||||
|
|
||||||
|
|
||||||
def parse_docx(file_path: str) -> str:
|
def parse_docx(file_path: str) -> str:
|
||||||
"""Parse a DOCX file and return its text content.
|
"""Parse a DOCX file and return its text content.
|
||||||
|
|
||||||
The function preserves paragraph breaks by inserting a newline between
|
Extracts text from both paragraphs and tables in document order,
|
||||||
paragraphs. Empty documents yield an empty string.
|
preserving the original reading sequence. Table rows are joined
|
||||||
|
with `` | `` separators; paragraphs and rows are separated by newlines.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
ValueError: If the file is not a valid DOCX document or cannot be read.
|
ValueError: If the file is not a valid DOCX document or cannot be read.
|
||||||
|
|
@ -23,6 +33,26 @@ def parse_docx(file_path: str) -> str:
|
||||||
except Exception as exc: # pragma: no cover - surface invalid DOCX
|
except Exception as exc: # pragma: no cover - surface invalid DOCX
|
||||||
raise ValueError(f"Invalid DOCX file: {exc}") from exc
|
raise ValueError(f"Invalid DOCX file: {exc}") from exc
|
||||||
|
|
||||||
paragraphs = [para.text for para in doc.paragraphs if para.text is not None]
|
parts: list[str] = []
|
||||||
# Join with newline to preserve paragraph breaks
|
|
||||||
return "\n".join(paragraphs).strip()
|
for element in doc.element.body:
|
||||||
|
tag = element.tag
|
||||||
|
|
||||||
|
if tag == f"{{{_WML_NS}}}p":
|
||||||
|
# Paragraph – extract all run text
|
||||||
|
text = _extract_text_from_element(element, {})
|
||||||
|
parts.append(text)
|
||||||
|
|
||||||
|
elif tag == f"{{{_WML_NS}}}tbl":
|
||||||
|
# Table – extract rows and cells in order
|
||||||
|
nsmap = {"w": _WML_NS}
|
||||||
|
for row in element.findall("w:tr", nsmap):
|
||||||
|
row_texts: list[str] = []
|
||||||
|
for cell in row.findall("w:tc", nsmap):
|
||||||
|
cell_text = _extract_text_from_element(cell, nsmap).strip()
|
||||||
|
if cell_text:
|
||||||
|
row_texts.append(cell_text)
|
||||||
|
if row_texts:
|
||||||
|
parts.append(" | ".join(row_texts))
|
||||||
|
|
||||||
|
return "\n".join(parts).strip()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue