refactor(backend): update document parsers for DOCX and PDF

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-23 13:27:08 +08:00
parent f4d78b0b77
commit 44028ebd6e
2 changed files with 7 additions and 17 deletions

View File

@@ -1,17 +1,10 @@
from __future__ import annotations from __future__ import annotations
from typing import Optional try:
Document = None from docx import Document as _Doc
Document = _Doc
except Exception:
def _ensure_docx_imported(): Document = None
global Document
if Document is None:
try:
from docx import Document as _Doc # type: ignore
Document = _Doc
except Exception as exc: # pragma: no cover - missing optional dep
raise ValueError("DOCX library is not installed") from exc
def parse_docx(file_path: str) -> str: def parse_docx(file_path: str) -> str:
@@ -23,9 +16,9 @@ def parse_docx(file_path: str) -> str:
Raises: Raises:
ValueError: If the file is not a valid DOCX document or cannot be read. ValueError: If the file is not a valid DOCX document or cannot be read.
""" """
if Document is None:
raise ValueError("DOCX library is not installed")
try: try:
_ensure_docx_imported()
assert Document is not None
doc = Document(file_path) doc = Document(file_path)
except Exception as exc: # pragma: no cover - surface invalid DOCX except Exception as exc: # pragma: no cover - surface invalid DOCX
raise ValueError(f"Invalid DOCX file: {exc}") from exc raise ValueError(f"Invalid DOCX file: {exc}") from exc

View File

@@ -1,6 +1,3 @@
from __future__ import annotations
from typing import Optional
from pypdf import PdfReader from pypdf import PdfReader