legco_ai_assistant/backend/app/test/test_phase1_ingest.py

107 lines
4.3 KiB
Python

"""Phase 1 tests: Document ingestion endpoint.
Covers:
- POST /api/v1/ingest with valid documents
- Metadata extraction (filename, upload_date, content_summary)
- ChromaDB persistence with embeddings
- Error handling for unsupported file types
"""
import pytest
from fastapi.testclient import TestClient
from unittest.mock import MagicMock, patch
class TestIngest:
    """Exercise POST /api/v1/ingest through a FastAPI test client.

    The PDF and DOCX cases patch RAGService, the parsers, the chunker and the
    metadata extractor, so they only check the shape of the endpoint response.
    """

    @pytest.fixture
    def client(self):
        """Return a TestClient wrapping the application object."""
        # Imported inside the fixture so that importing this test module does
        # not itself import the application.
        from app.main import app

        return TestClient(app)

    @staticmethod
    def _upload(client, filename, payload, content_type):
        """POST ``payload`` to the ingest endpoint as the multipart field "file"."""
        import io

        upload = (filename, io.BytesIO(payload), content_type)
        return client.post("/api/v1/ingest", files={"file": upload})

    def test_ingest_pdf_success(self, client, tmp_path):
        """A PDF upload answers 200 with a document ID, chunk count and filename."""
        rag_stub = MagicMock()
        rag_stub.ingest_document.return_value = "doc-123"
        rag_stub.list_documents.return_value = ([], 0, 0)

        chunker_stub = MagicMock()
        chunker_stub.chunk_pages.return_value = [("chunk 1", 1), ("chunk 2", 2)]

        parsed_pages = [(1, "Page 1 text"), (2, "Page 2 text")]
        chunk_metadata = [
            {"filename": "test.pdf", "chunk_index": 0},
            {"filename": "test.pdf", "chunk_index": 1},
        ]

        # One multi-item ``with`` enters the patches in the same order as the
        # original nested blocks; ``return_value=`` configures each mock.
        with patch("app.services.rag.RAGService", return_value=rag_stub), \
                patch("app.utils.pdf_parser.parse_pdf_by_page", return_value=parsed_pages), \
                patch("app.utils.chunking.TokenChunkingStrategy", return_value=chunker_stub), \
                patch("app.utils.metadata.extract_metadata", return_value=chunk_metadata), \
                patch("app.utils.pdf_extractor.extract_page_as_pdf"):
            response = self._upload(client, "test.pdf", b"%PDF-1.4", "application/pdf")

        assert response.status_code == 200
        body = response.json()
        assert "document_id" in body
        assert body["chunk_count"] == 2
        assert body["filename"] == "test.pdf"

    def test_ingest_docx_success(self, client, tmp_path):
        """A DOCX upload answers 200 with the chunk count and filename."""
        rag_stub = MagicMock()
        rag_stub.ingest_document.return_value = "doc-456"
        rag_stub.list_documents.return_value = ([], 0, 0)

        chunker_stub = MagicMock()
        chunker_stub.chunk.return_value = ["chunk 1"]

        chunk_metadata = [{"filename": "test.docx", "chunk_index": 0}]
        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

        with patch("app.services.rag.RAGService", return_value=rag_stub), \
                patch("app.utils.docx_parser.parse_docx", return_value="Parsed DOCX text content"), \
                patch("app.utils.chunking.TokenChunkingStrategy", return_value=chunker_stub), \
                patch("app.utils.metadata.extract_metadata", return_value=chunk_metadata):
            response = self._upload(client, "test.docx", b"docx content", docx_mime)

        assert response.status_code == 200
        body = response.json()
        assert body["chunk_count"] == 1
        assert body["filename"] == "test.docx"

    def test_ingest_unsupported_format(self, client):
        """A JPEG upload is rejected with 400 and an "unsupported" detail message."""
        response = self._upload(client, "test.jpg", b"image data", "image/jpeg")

        assert response.status_code == 400
        detail = response.json()["detail"]
        assert "unsupported" in detail.lower()

    def test_ingest_no_file(self, client):
        """A POST without a file part is rejected with 422."""
        response = client.post("/api/v1/ingest")

        assert response.status_code == 422