107 lines
4.3 KiB
Python
107 lines
4.3 KiB
Python
"""Phase 1 tests: Document ingestion endpoint.

Covers:
- POST /api/v1/ingest with valid documents
- Metadata extraction (filename, upload_date, content_summary)
- ChromaDB persistence with embeddings
- Error handling for unsupported file types
"""
|
|
import pytest
|
|
from fastapi.testclient import TestClient
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
|
|
class TestIngest:
    """Tests for the ``POST /api/v1/ingest`` upload endpoint.

    The success-path tests swap the RAG service, the parsers, the chunker and
    the metadata extractor for mocks via ``unittest.mock.patch``; the
    failure-path tests post directly with no patching.
    """

    @pytest.fixture
    def client(self):
        """Return a ``TestClient`` wrapping the application object."""
        # The app is imported inside the fixture rather than at module import time.
        from app.main import app

        return TestClient(app)

    def test_ingest_pdf_success(self, client, tmp_path):
        """A PDF upload answers 200 with a document id, chunk count and filename."""
        import io

        # Stand-in for the object produced by calling RAGService().
        rag_double = MagicMock()
        rag_double.ingest_document.return_value = "doc-123"
        rag_double.list_documents.return_value = ([], 0, 0)

        # Stand-in for the object produced by calling TokenChunkingStrategy().
        chunker_double = MagicMock()
        chunker_double.chunk_pages.return_value = [("chunk 1", 1), ("chunk 2", 2)]

        parsed_pages = [(1, "Page 1 text"), (2, "Page 2 text")]
        chunk_metadata = [
            {"filename": "test.pdf", "chunk_index": 0},
            {"filename": "test.pdf", "chunk_index": 1},
        ]
        upload = {"file": ("test.pdf", io.BytesIO(b"%PDF-1.4"), "application/pdf")}

        # One statement enters every patch, in the order listed.
        with patch("app.services.rag.RAGService", return_value=rag_double), \
                patch("app.utils.pdf_parser.parse_pdf_by_page", return_value=parsed_pages), \
                patch("app.utils.chunking.TokenChunkingStrategy", return_value=chunker_double), \
                patch("app.utils.metadata.extract_metadata", return_value=chunk_metadata), \
                patch("app.utils.pdf_extractor.extract_page_as_pdf"):
            response = client.post("/api/v1/ingest", files=upload)

        assert response.status_code == 200
        payload = response.json()
        assert "document_id" in payload
        assert payload["chunk_count"] == 2
        assert payload["filename"] == "test.pdf"

    def test_ingest_docx_success(self, client, tmp_path):
        """A DOCX upload answers 200 with the expected chunk count and filename."""
        import io

        # Stand-in for the object produced by calling RAGService().
        rag_double = MagicMock()
        rag_double.ingest_document.return_value = "doc-456"
        rag_double.list_documents.return_value = ([], 0, 0)

        # Stand-in for the object produced by calling TokenChunkingStrategy().
        chunker_double = MagicMock()
        chunker_double.chunk.return_value = ["chunk 1"]

        chunk_metadata = [{"filename": "test.docx", "chunk_index": 0}]
        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        upload = {"file": ("test.docx", io.BytesIO(b"docx content"), docx_mime)}

        # One statement enters every patch, in the order listed.
        with patch("app.services.rag.RAGService", return_value=rag_double), \
                patch("app.utils.docx_parser.parse_docx", return_value="Parsed DOCX text content"), \
                patch("app.utils.chunking.TokenChunkingStrategy", return_value=chunker_double), \
                patch("app.utils.metadata.extract_metadata", return_value=chunk_metadata):
            response = client.post("/api/v1/ingest", files=upload)

        assert response.status_code == 200
        payload = response.json()
        assert payload["chunk_count"] == 1
        assert payload["filename"] == "test.docx"

    def test_ingest_unsupported_format(self, client):
        """A JPEG upload is refused with 400 and a detail mentioning "unsupported"."""
        import io

        upload = {"file": ("test.jpg", io.BytesIO(b"image data"), "image/jpeg")}
        response = client.post("/api/v1/ingest", files=upload)

        assert response.status_code == 400
        detail = response.json()["detail"]
        assert "unsupported" in detail.lower()

    def test_ingest_no_file(self, client):
        """A request with no file part is answered with 422."""
        status = client.post("/api/v1/ingest").status_code

        assert status == 422
|