legco_ai_assistant/backend/app/test/test_phase1_parsers.py

68 lines
1.9 KiB
Python

"""Phase 1.2: Document parsers tests (DOCX and PDF)."""
import os
from pathlib import Path
import pytest
# python-docx may not be installed in all environments. Skip DOCX tests if unavailable.
def test_parse_docx_basic(tmp_path):
    """Check that ``parse_docx`` returns the paragraphs joined by a newline.

    A two-paragraph DOCX ("Hello", "World") is generated with python-docx in
    the pytest-provided temporary directory and then parsed back.

    The test is skipped when python-docx cannot be imported.
    """
    # importorskip skips only when the import itself fails. A genuine error
    # while building the document now fails the test instead of being
    # reported as "not installed" (the old broad ``except Exception`` hid it).
    docx = pytest.importorskip(
        "docx", reason="python-docx not installed, skipping DOCX tests"
    )

    # Dynamically create a minimal DOCX with two paragraphs
    doc_path = tmp_path / "sample.docx"
    doc = docx.Document()
    doc.add_paragraph("Hello")
    doc.add_paragraph("World")
    doc.save(str(doc_path))

    # Import here to avoid test import side effects
    from app.utils.docx_parser import parse_docx

    text = parse_docx(str(doc_path))
    assert text == "Hello\nWorld"
def test_parse_docx_empty(tmp_path):
    """Check that ``parse_docx`` returns an empty string for a DOCX with no paragraphs.

    The document is created with python-docx and saved without adding any
    content. The test is skipped when python-docx cannot be imported.
    """
    # importorskip skips only when the import itself fails; any other error
    # while creating the document surfaces as a real test failure rather
    # than a misleading "not installed" skip.
    docx = pytest.importorskip(
        "docx", reason="python-docx not installed, skipping DOCX tests"
    )

    doc_path = tmp_path / "empty.docx"
    docx.Document().save(str(doc_path))

    # Imported inside the test to avoid import side effects at collection time
    from app.utils.docx_parser import parse_docx

    text = parse_docx(str(doc_path))
    assert text == ""
def test_parse_docx_corrupted(tmp_path):
    """Check that ``parse_docx`` raises ``ValueError`` for a non-DOCX file named ``.docx``.

    The test is skipped when python-docx cannot be imported, matching the
    skip behaviour of the other DOCX tests in this module.
    """
    # NOTE(review): assumes app.utils.docx_parser depends on python-docx, so
    # without it this test would error on import instead of skipping - confirm.
    pytest.importorskip(
        "docx", reason="python-docx not installed, skipping DOCX tests"
    )

    # Create a file with DOCX extension but invalid content
    corrupted_path = tmp_path / "corrupted.docx"
    corrupted_path.write_bytes(b"not a real docx content")

    # Imported inside the test to avoid import side effects at collection time
    from app.utils.docx_parser import parse_docx

    with pytest.raises(ValueError):
        parse_docx(str(corrupted_path))
def test_parse_pdf_empty(tmp_path):
    """A zero-byte ``.pdf`` file must make ``parse_pdf`` raise ``ValueError``."""
    # Parser is imported inside the test, not at module import time.
    from app.utils.pdf_parser import parse_pdf

    empty_pdf = tmp_path / "empty.pdf"
    # tmp_path is fresh for each test, so touch() creates a new 0-byte file.
    empty_pdf.touch()

    with pytest.raises(ValueError):
        parse_pdf(str(empty_pdf))
def test_parse_pdf_corrupted(tmp_path):
    """A ``.pdf`` file holding arbitrary non-PDF bytes must make ``parse_pdf`` raise ``ValueError``."""
    # Parser is imported inside the test, not at module import time.
    from app.utils.pdf_parser import parse_pdf

    bogus_pdf = tmp_path / "corrupted.pdf"
    with open(bogus_pdf, "wb") as handle:
        handle.write(b"not a pdf content")

    with pytest.raises(ValueError):
        parse_pdf(str(bogus_pdf))