# WordprocessingML namespace
_WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"


def _extract_text_from_element(element, nsmap: dict) -> str:
    """Return the text held in the ``w:t`` descendants of *element*.

    Text inside a single paragraph is concatenated exactly as stored, so
    run boundaries never introduce extra characters. When *element*
    contains several ``w:p`` paragraphs (for example a table cell, or a
    paragraph that embeds further paragraphs), each paragraph's text is
    separated from its neighbours by one space so that the last word of
    one paragraph is not fused with the first word of the next.
    Paragraphs without any text contribute nothing.

    Only ``w:t`` nodes are read; other run content is not converted to
    text.

    Args:
        element: An XML element exposing ``.tag``, ``.text`` and child
            iteration (the ElementTree/lxml element interface).
        nsmap: Prefix-to-namespace mapping. Kept so existing callers keep
            working; tags are compared in Clark notation
            (``{namespace}local``), so the mapping is not consulted.

    Returns:
        The collected text, or an empty string when no ``w:t`` node with
        text is present.
    """
    paragraph_tag = f"{{{_WML_NS}}}p"
    text_tag = f"{{{_WML_NS}}}t"

    # Each inner list collects the text pieces of one paragraph-delimited
    # segment. A new segment is opened on entering a paragraph and again on
    # leaving it, so text before, inside and after a nested paragraph stays
    # in separate segments.
    segments: list[list[str]] = [[]]

    def _walk(node) -> None:
        for child in node:
            tag = child.tag
            if tag == text_tag:
                segments[-1].append(child.text or "")
            elif tag == paragraph_tag:
                segments.append([])
                _walk(child)
                segments.append([])
            else:
                _walk(child)

    _walk(element)

    joined = ("".join(segment) for segment in segments)
    # A space (not a newline) is the separator because the caller joins
    # cells into one line per table row.
    return " ".join(text for text in joined if text)
@@ -23,6 +33,26 @@ def parse_docx(file_path: str) -> str: except Exception as exc: # pragma: no cover - surface invalid DOCX raise ValueError(f"Invalid DOCX file: {exc}") from exc - paragraphs = [para.text for para in doc.paragraphs if para.text is not None] - # Join with newline to preserve paragraph breaks - return "\n".join(paragraphs).strip() + parts: list[str] = [] + + for element in doc.element.body: + tag = element.tag + + if tag == f"{{{_WML_NS}}}p": + # Paragraph – extract all run text + text = _extract_text_from_element(element, {}) + parts.append(text) + + elif tag == f"{{{_WML_NS}}}tbl": + # Table – extract rows and cells in order + nsmap = {"w": _WML_NS} + for row in element.findall("w:tr", nsmap): + row_texts: list[str] = [] + for cell in row.findall("w:tc", nsmap): + cell_text = _extract_text_from_element(cell, nsmap).strip() + if cell_text: + row_texts.append(cell_text) + if row_texts: + parts.append(" | ".join(row_texts)) + + return "\n".join(parts).strip()