refactor(backend): update document parsers for DOCX and PDF

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
f4d78b0b77
commit
44028ebd6e
|
|
@@ -1,17 +1,10 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
Document = None
|
|
||||||
|
|
||||||
|
|
||||||
def _ensure_docx_imported():
|
|
||||||
global Document
|
|
||||||
if Document is None:
|
|
||||||
try:
|
try:
|
||||||
from docx import Document as _Doc # type: ignore
|
from docx import Document as _Doc
|
||||||
Document = _Doc
|
Document = _Doc
|
||||||
except Exception as exc: # pragma: no cover - missing optional dep
|
except Exception:
|
||||||
raise ValueError("DOCX library is not installed") from exc
|
Document = None
|
||||||
|
|
||||||
|
|
||||||
def parse_docx(file_path: str) -> str:
|
def parse_docx(file_path: str) -> str:
|
||||||
|
|
@@ -23,9 +16,9 @@ def parse_docx(file_path: str) -> str:
|
||||||
Raises:
|
Raises:
|
||||||
ValueError: If the file is not a valid DOCX document or cannot be read.
|
ValueError: If the file is not a valid DOCX document or cannot be read.
|
||||||
"""
|
"""
|
||||||
|
if Document is None:
|
||||||
|
raise ValueError("DOCX library is not installed")
|
||||||
try:
|
try:
|
||||||
_ensure_docx_imported()
|
|
||||||
assert Document is not None
|
|
||||||
doc = Document(file_path)
|
doc = Document(file_path)
|
||||||
except Exception as exc: # pragma: no cover - surface invalid DOCX
|
except Exception as exc: # pragma: no cover - surface invalid DOCX
|
||||||
raise ValueError(f"Invalid DOCX file: {exc}") from exc
|
raise ValueError(f"Invalid DOCX file: {exc}") from exc
|
||||||
|
|
|
||||||
|
|
@@ -1,6 +1,3 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from typing import Optional
|
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue