68 lines
1.9 KiB
Python
68 lines
1.9 KiB
Python
"""Phase 1.2: Document parsers tests (DOCX and PDF)."""
|
|
import os
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
# python-docx may not be installed in all environments. Skip DOCX tests if unavailable.
|
|
|
|
|
|
def test_parse_docx_basic(tmp_path):
    """Check that ``parse_docx`` returns paragraph texts joined by a newline.

    A DOCX with the two paragraphs "Hello" and "World" is generated under
    ``tmp_path`` with python-docx and then read back through ``parse_docx``.
    The test is skipped when python-docx cannot be imported.
    """
    # Guard only the import: a failure inside Document() itself should
    # surface as a test failure rather than be reported as a skip.
    docx = pytest.importorskip(
        "docx", reason="python-docx not installed, skipping DOCX tests"
    )

    # Dynamically create a minimal DOCX with two paragraphs
    doc_path = tmp_path / "sample.docx"
    doc = docx.Document()
    doc.add_paragraph("Hello")
    doc.add_paragraph("World")
    doc.save(str(doc_path))

    # Import here to avoid test import side effects
    from app.utils.docx_parser import parse_docx

    text = parse_docx(str(doc_path))
    assert text == "Hello\nWorld"
|
|
|
|
|
|
def test_parse_docx_empty(tmp_path):
    """Check that ``parse_docx`` returns an empty string for an empty DOCX.

    A document with no paragraphs added is saved under ``tmp_path`` with
    python-docx and then read back through ``parse_docx``. The test is
    skipped when python-docx cannot be imported.
    """
    # Guard only the import: a failure inside Document() itself should
    # surface as a test failure rather than be reported as a skip.
    docx = pytest.importorskip(
        "docx", reason="python-docx not installed, skipping DOCX tests"
    )

    doc_path = tmp_path / "empty.docx"
    doc = docx.Document()
    doc.save(str(doc_path))

    # Import here to avoid test import side effects
    from app.utils.docx_parser import parse_docx

    text = parse_docx(str(doc_path))
    assert text == ""
|
|
|
|
|
|
def test_parse_docx_corrupted(tmp_path):
    """Expect ``parse_docx`` to raise ``ValueError`` for non-DOCX bytes.

    The input file carries a ``.docx`` extension, but its content is an
    arbitrary byte string rather than a real document.
    """
    from app.utils.docx_parser import parse_docx

    # A .docx name wrapped around bytes that are not a valid document.
    bogus_docx = tmp_path / "corrupted.docx"
    bogus_docx.write_bytes(b"not a real docx content")
    target = str(bogus_docx)

    with pytest.raises(ValueError):
        parse_docx(target)
|
|
|
|
|
|
def test_parse_pdf_empty(tmp_path):
    """Expect ``parse_pdf`` to raise ``ValueError`` for a zero-byte PDF file."""
    from app.utils.pdf_parser import parse_pdf

    # Writing an empty bytes object leaves a 0-byte file on disk.
    blank_pdf = tmp_path / "empty.pdf"
    blank_pdf.write_bytes(b"")
    target = str(blank_pdf)

    with pytest.raises(ValueError):
        parse_pdf(target)
|
|
|
|
|
|
def test_parse_pdf_corrupted(tmp_path):
    """Expect ``parse_pdf`` to raise ``ValueError`` for non-PDF bytes.

    The input file carries a ``.pdf`` extension, but its content is an
    arbitrary byte string rather than a real PDF.
    """
    from app.utils.pdf_parser import parse_pdf

    # A .pdf name wrapped around bytes that are not a valid PDF.
    bogus_pdf = tmp_path / "corrupted.pdf"
    bogus_pdf.write_bytes(b"not a pdf content")
    target = str(bogus_pdf)

    with pytest.raises(ValueError):
        parse_pdf(target)
|