"""Phase 1 tests: Document ingestion endpoint. Covers: - POST /api/v1/ingest with valid documents (PDF, DOCX, TXT) - Metadata extraction (filename, upload_date, content_summary) - ChromaDB persistence (verify by querying real collection) - Error handling for unsupported file types - Error handling for missing file field Uses TestClient + real ChromaDB + real chunking + real metadata extraction. Embedding function is mocked with deterministic vectors (external API). No LLM calls involved in the ingest pipeline. """ import io import os import pytest from fastapi import FastAPI from fastapi.testclient import TestClient from pypdf import PdfWriter from app.routers.ingest import router class _DeterministicEmbedding: def name(self) -> str: return "test_deterministic" def __call__(self, input): return self._embed(input) def embed_query(self, input): return self._embed(input) @staticmethod def _embed(texts): vectors = [] for text in texts: vec = [0.0] * 384 for i, ch in enumerate(text[:384]): vec[i] = ord(ch) / 1000.0 vectors.append(vec) return vectors def _create_real_pdf(content: str) -> bytes: from pypdf import PdfWriter writer = PdfWriter() writer.add_blank_page(width=200, height=200) page = writer.pages[0] # Add text content via page-level operator (simple approach) # pypdf blank pages have no text — we write the content as annotation # For testing, we just need a valid PDF; actual text extraction tested separately buf = io.BytesIO() writer.write(buf) return buf.getvalue() def _create_text_pdf(lines: list[str]) -> bytes: """Create a PDF with actual extractable text using reportlab if available.""" try: from reportlab.pdfgen import canvas as rl_canvas buf = io.BytesIO() c = rl_canvas.Canvas(buf) y = 750 for line in lines: c.drawString(72, y, line) y -= 20 c.save() return buf.getvalue() except ImportError: # Fallback: pypdf blank PDF (no extractable text) return _create_real_pdf("") def _create_real_docx(paragraphs: list[str]) -> bytes: try: from docx import Document doc = Document() for para in paragraphs: doc.add_paragraph(para) buf = io.BytesIO() doc.save(buf) return buf.getvalue() except ImportError: return b"" @pytest.fixture def client(tmp_path, monkeypatch): chroma_path = str(tmp_path / "chroma_db") chunk_path = str(tmp_path / "document_chunk") prompts_path = str(tmp_path / "prompts.db") history_path = str(tmp_path / "history.db") monkeypatch.setenv("CHROMA_DB_PATH", chroma_path) monkeypatch.setenv("DOCUMENT_CHUNK_PATH", chunk_path) monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path) monkeypatch.setenv("HISTORY_DB_PATH", history_path) monkeypatch.setenv("EMBEDDING_MODEL", "test-mock") monkeypatch.setenv("LLM_API_KEY", "test-key") from app.core.config import get_settings get_settings.cache_clear() from app.core.dependencies import get_settings_cached get_settings_cached.cache_clear() from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles conn = _get_db(prompts_path) init_prompts_db(conn) seed_default_profiles(conn) conn.close() hconn = _get_db(history_path) init_history_db(hconn) hconn.close() monkeypatch.setattr( "app.core.database.get_embedding_function_settings", lambda settings: _DeterministicEmbedding(), ) test_app = FastAPI() test_app.include_router(router, prefix="/api/v1") yield TestClient(test_app) get_settings_cached.cache_clear() get_settings.cache_clear() class TestIngest: def test_ingest_txt_success(self, client, tmp_path): """Should ingest TXT and return document ID with metadata. Verify real ChromaDB.""" import chromadb from app.core.config import get_settings settings = get_settings() response = client.post( "/api/v1/ingest", files={"file": ("notes.txt", io.BytesIO(b"This is a test document about testing.\nIt has multiple lines of content."), "text/plain")}, ) assert response.status_code == 200 data = response.json() assert "document_id" in data assert data["chunk_count"] >= 1 assert data["filename"] == "notes.txt" # Verify data persisted in real ChromaDB db_client = chromadb.PersistentClient(path=settings.chroma_db_path) collection = db_client.get_collection("documents") all_data = collection.get(include=["metadatas"]) assert len(all_data["ids"]) >= 1 filenames = [m["filename"] for m in all_data["metadatas"]] assert "notes.txt" in filenames def test_ingest_docx_success(self, client, tmp_path): """Should ingest DOCX and return document ID with metadata.""" docx_bytes = _create_real_docx(["Paragraph one content.", "Paragraph two content."]) if not docx_bytes: pytest.skip("python-docx not installed") response = client.post( "/api/v1/ingest", files={"file": ("test.docx", io.BytesIO(docx_bytes), "application/vnd.openxmlformats-officedocument.wordprocessingml.document")}, ) assert response.status_code == 200 data = response.json() assert data["chunk_count"] >= 1 assert data["filename"] == "test.docx" def test_ingest_pdf_success(self, client, tmp_path): """Should ingest PDF and return document ID with metadata.""" pdf_bytes = _create_text_pdf(["Page 1 line one", "Page 1 line two"]) response = client.post( "/api/v1/ingest", files={"file": ("test.pdf", io.BytesIO(pdf_bytes), "application/pdf")}, ) assert response.status_code == 200 data = response.json() assert "document_id" in data assert data["filename"] == "test.pdf" def test_ingest_unsupported_format(self, client): """Should reject unsupported file formats.""" response = client.post( "/api/v1/ingest", files={"file": ("test.jpg", io.BytesIO(b"image data"), "image/jpeg")}, ) assert response.status_code == 400 assert "unsupported" in response.json()["detail"].lower() def test_ingest_no_file(self, client): """Should reject request without file.""" response = client.post("/api/v1/ingest") assert response.status_code == 422