refactor(backend): update document parsers for DOCX and PDF
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
f4d78b0b77
commit
44028ebd6e
|
|
@ -1,17 +1,10 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
Document = None
|
||||
|
||||
|
||||
def _ensure_docx_imported():
|
||||
global Document
|
||||
if Document is None:
|
||||
try:
|
||||
from docx import Document as _Doc # type: ignore
|
||||
from docx import Document as _Doc
|
||||
Document = _Doc
|
||||
except Exception as exc: # pragma: no cover - missing optional dep
|
||||
raise ValueError("DOCX library is not installed") from exc
|
||||
except Exception:
|
||||
Document = None
|
||||
|
||||
|
||||
def parse_docx(file_path: str) -> str:
|
||||
|
|
@ -23,9 +16,9 @@ def parse_docx(file_path: str) -> str:
|
|||
Raises:
|
||||
ValueError: If the file is not a valid DOCX document or cannot be read.
|
||||
"""
|
||||
if Document is None:
|
||||
raise ValueError("DOCX library is not installed")
|
||||
try:
|
||||
_ensure_docx_imported()
|
||||
assert Document is not None
|
||||
doc = Document(file_path)
|
||||
except Exception as exc: # pragma: no cover - surface invalid DOCX
|
||||
raise ValueError(f"Invalid DOCX file: {exc}") from exc
|
||||
|
|
|
|||
|
|
@ -1,6 +1,3 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue