"""Phase 1.2: Document parsers tests (DOCX and PDF).""" import os from pathlib import Path import pytest # python-docx may not be installed in all environments. Skip DOCX tests if unavailable. def test_parse_docx_basic(tmp_path): # Dynamically create a minimal DOCX with two paragraphs doc_path = tmp_path / "sample.docx" try: from docx import Document as Doc doc = Doc() except Exception: pytest.skip("python-docx not installed, skipping DOCX tests") doc.add_paragraph("Hello") doc.add_paragraph("World") doc.save(str(doc_path)) # Import here to avoid test import side effects from app.utils.docx_parser import parse_docx text = parse_docx(str(doc_path)) assert text == "Hello\nWorld" def test_parse_docx_empty(tmp_path): doc_path = tmp_path / "empty.docx" try: from docx import Document as Doc doc = Doc() except Exception: pytest.skip("python-docx not installed, skipping DOCX tests") doc.save(str(doc_path)) from app.utils.docx_parser import parse_docx text = parse_docx(str(doc_path)) assert text == "" def test_parse_docx_corrupted(tmp_path): # Create a file with DOCX extension but invalid content corrupted_path = tmp_path / "corrupted.docx" corrupted_path.write_bytes(b"not a real docx content") from app.utils.docx_parser import parse_docx with pytest.raises(ValueError): parse_docx(str(corrupted_path)) def test_parse_pdf_empty(tmp_path): # Create an empty (0-byte) PDF file pdf_path = tmp_path / "empty.pdf" pdf_path.write_bytes(b"") # 0 bytes from app.utils.pdf_parser import parse_pdf with pytest.raises(ValueError): parse_pdf(str(pdf_path)) def test_parse_pdf_corrupted(tmp_path): pdf_path = tmp_path / "corrupted.pdf" pdf_path.write_bytes(b"not a pdf content") from app.utils.pdf_parser import parse_pdf with pytest.raises(ValueError): parse_pdf(str(pdf_path))