205 lines
6.6 KiB
Python
205 lines
6.6 KiB
Python
"""Phase 1 tests: Document ingestion endpoint.
|
|
|
|
Covers:
- POST /api/v1/ingest with valid documents (PDF, DOCX, TXT)
- Metadata extraction (filename, upload_date, content_summary)
- ChromaDB persistence (verified by querying the real collection)
- Error handling for unsupported file types
- Error handling for a missing file field

Uses TestClient with real ChromaDB, real chunking, and real metadata extraction.
The embedding function is mocked with deterministic vectors because the real one
calls an external API.
No LLM calls are involved in the ingest pipeline.
|
|
"""
|
|
import io
|
|
import os
|
|
|
|
import pytest
|
|
from fastapi import FastAPI
|
|
from fastapi.testclient import TestClient
|
|
from pypdf import PdfWriter
|
|
|
|
from app.routers.ingest import router
|
|
|
|
|
|
class _DeterministicEmbedding:
|
|
def name(self) -> str:
|
|
return "test_deterministic"
|
|
|
|
def __call__(self, input):
|
|
return self._embed(input)
|
|
|
|
def embed_query(self, input):
|
|
return self._embed(input)
|
|
|
|
@staticmethod
|
|
def _embed(texts):
|
|
vectors = []
|
|
for text in texts:
|
|
vec = [0.0] * 384
|
|
for i, ch in enumerate(text[:384]):
|
|
vec[i] = ord(ch) / 1000.0
|
|
vectors.append(vec)
|
|
return vectors
|
|
|
|
|
|
def _create_real_pdf(content: str) -> bytes:
    """Build a minimal, structurally valid one-page PDF and return its bytes.

    The page is blank. ``content`` is NOT written into the document, so the
    result carries no extractable text; the parameter is kept only so
    existing callers keep working.

    Args:
        content: Ignored.

    Returns:
        The serialized PDF containing a single blank 200x200 page.
    """
    # PdfWriter comes from the module-level ``from pypdf import PdfWriter``.
    writer = PdfWriter()
    writer.add_blank_page(width=200, height=200)
    buf = io.BytesIO()
    writer.write(buf)
    return buf.getvalue()
|
|
|
|
|
|
def _create_text_pdf(lines: list[str]) -> bytes:
    """Create a PDF with extractable text, using reportlab when available.

    Each entry of ``lines`` is drawn on one page at x=72, starting at y=750
    and moving down 20 units per line.

    Args:
        lines: Text lines to draw, in order.

    Returns:
        The serialized PDF. If reportlab cannot be imported, a blank
        single-page PDF from ``_create_real_pdf`` (no extractable text).
    """
    # Guard only the optional import: an ImportError raised while drawing or
    # saving must surface instead of silently turning into the blank fallback.
    try:
        from reportlab.pdfgen import canvas as rl_canvas
    except ImportError:
        # Fallback: pypdf blank PDF (no extractable text)
        return _create_real_pdf("")

    buf = io.BytesIO()
    pdf = rl_canvas.Canvas(buf)
    y = 750
    for line in lines:
        pdf.drawString(72, y, line)
        y -= 20
    pdf.save()
    return buf.getvalue()
|
|
|
|
|
|
def _create_real_docx(paragraphs: list[str]) -> bytes:
|
|
try:
|
|
from docx import Document
|
|
doc = Document()
|
|
for para in paragraphs:
|
|
doc.add_paragraph(para)
|
|
buf = io.BytesIO()
|
|
doc.save(buf)
|
|
return buf.getvalue()
|
|
except ImportError:
|
|
return b""
|
|
|
|
|
|
@pytest.fixture
def client(tmp_path, monkeypatch):
    """Yield a TestClient for the ingest router backed by per-test storage.

    Setup, in order:
      1. Point every storage path env var at a location under ``tmp_path``
         and set dummy embedding/LLM settings.
      2. Clear the cached settings objects (presumably so they are rebuilt
         from the patched environment -- confirm in app.core.config).
      3. Initialise the prompts DB (including ``seed_default_profiles``) and
         the history DB.
      4. Replace ``app.core.database.get_embedding_function_settings`` with
         a factory returning ``_DeterministicEmbedding``.
      5. Mount the router under ``/api/v1`` on a fresh FastAPI app.

    Teardown clears the settings caches again.
    """
    from contextlib import closing

    prompts_path = str(tmp_path / "prompts.db")
    history_path = str(tmp_path / "history.db")

    env_overrides = {
        "CHROMA_DB_PATH": str(tmp_path / "chroma_db"),
        "DOCUMENT_CHUNK_PATH": str(tmp_path / "document_chunk"),
        "PROMPTS_DB_PATH": prompts_path,
        "HISTORY_DB_PATH": history_path,
        "EMBEDDING_MODEL": "test-mock",
        "LLM_API_KEY": "test-key",
    }
    for key, value in env_overrides.items():
        monkeypatch.setenv(key, value)

    from app.core.config import get_settings
    get_settings.cache_clear()
    from app.core.dependencies import get_settings_cached
    get_settings_cached.cache_clear()

    from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles

    # closing() guarantees the handle is released even if an init step raises,
    # so a failed setup does not leave a connection open.
    with closing(_get_db(prompts_path)) as conn:
        init_prompts_db(conn)
        seed_default_profiles(conn)

    with closing(_get_db(history_path)) as hconn:
        init_history_db(hconn)

    monkeypatch.setattr(
        "app.core.database.get_embedding_function_settings",
        lambda settings: _DeterministicEmbedding(),
    )

    test_app = FastAPI()
    test_app.include_router(router, prefix="/api/v1")

    yield TestClient(test_app)

    get_settings_cached.cache_clear()
    get_settings.cache_clear()
|
|
|
|
|
|
class TestIngest:
    """Endpoint tests for POST /api/v1/ingest."""

    def test_ingest_txt_success(self, client, tmp_path):
        """A TXT upload returns 200 with metadata and is stored in ChromaDB."""
        import chromadb
        from app.core.config import get_settings

        settings = get_settings()

        upload = (
            "notes.txt",
            io.BytesIO(b"This is a test document about testing.\nIt has multiple lines of content."),
            "text/plain",
        )
        resp = client.post("/api/v1/ingest", files={"file": upload})

        assert resp.status_code == 200
        body = resp.json()
        assert "document_id" in body
        assert body["chunk_count"] >= 1
        assert body["filename"] == "notes.txt"

        # Open the on-disk collection directly to confirm the chunks were persisted.
        store = chromadb.PersistentClient(path=settings.chroma_db_path)
        stored = store.get_collection("documents").get(include=["metadatas"])
        assert len(stored["ids"]) >= 1
        stored_names = {meta["filename"] for meta in stored["metadatas"]}
        assert "notes.txt" in stored_names

    def test_ingest_docx_success(self, client, tmp_path):
        """A DOCX upload returns 200 with at least one chunk and its filename."""
        docx_bytes = _create_real_docx(["Paragraph one content.", "Paragraph two content."])
        if not docx_bytes:
            # The helper returns empty bytes when python-docx is unavailable.
            pytest.skip("python-docx not installed")

        upload = (
            "test.docx",
            io.BytesIO(docx_bytes),
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
        resp = client.post("/api/v1/ingest", files={"file": upload})

        assert resp.status_code == 200
        body = resp.json()
        assert body["chunk_count"] >= 1
        assert body["filename"] == "test.docx"

    def test_ingest_pdf_success(self, client, tmp_path):
        """A PDF upload returns 200 with a document ID and its filename."""
        pdf_bytes = _create_text_pdf(["Page 1 line one", "Page 1 line two"])

        upload = ("test.pdf", io.BytesIO(pdf_bytes), "application/pdf")
        resp = client.post("/api/v1/ingest", files={"file": upload})

        assert resp.status_code == 200
        body = resp.json()
        assert "document_id" in body
        assert body["filename"] == "test.pdf"

    def test_ingest_unsupported_format(self, client):
        """An unsupported file type is rejected with 400 and a clear detail."""
        upload = ("test.jpg", io.BytesIO(b"image data"), "image/jpeg")
        resp = client.post("/api/v1/ingest", files={"file": upload})

        assert resp.status_code == 400
        detail = resp.json()["detail"]
        assert "unsupported" in detail.lower()

    def test_ingest_no_file(self, client):
        """A request with no file part fails validation with 422."""
        resp = client.post("/api/v1/ingest")

        assert resp.status_code == 422
|