refactor(backend): update document parsers for DOCX and PDF

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Woody 2026-04-23 13:27:08 +08:00
parent f4d78b0b77
commit 44028ebd6e
2 changed files with 7 additions and 17 deletions

View File

@@ -1,17 +1,10 @@
from __future__ import annotations
from typing import Optional
Document = None
def _ensure_docx_imported():
global Document
if Document is None:
try:
from docx import Document as _Doc # type: ignore
from docx import Document as _Doc
Document = _Doc
except Exception as exc: # pragma: no cover - missing optional dep
raise ValueError("DOCX library is not installed") from exc
except Exception:
Document = None
def parse_docx(file_path: str) -> str:
@@ -23,9 +16,9 @@ def parse_docx(file_path: str) -> str:
Raises:
ValueError: If the file is not a valid DOCX document or cannot be read.
"""
if Document is None:
raise ValueError("DOCX library is not installed")
try:
_ensure_docx_imported()
assert Document is not None
doc = Document(file_path)
except Exception as exc: # pragma: no cover - surface invalid DOCX
raise ValueError(f"Invalid DOCX file: {exc}") from exc

View File

@@ -1,6 +1,3 @@
from __future__ import annotations
from typing import Optional
from pypdf import PdfReader