# Source: legco_ai_assistant/backend/app/test/test_phase1_ingest.py (205 lines, 6.6 KiB, Python)

"""Phase 1 tests: Document ingestion endpoint.
Covers:
- POST /api/v1/ingest with valid documents (PDF, DOCX, TXT)
- Metadata extraction (filename, upload_date, content_summary)
- ChromaDB persistence (verify by querying real collection)
- Error handling for unsupported file types
- Error handling for missing file field
Uses TestClient + real ChromaDB + real chunking + real metadata extraction.
The embedding function (normally an external API call) is replaced with a deterministic stub.
No LLM calls involved in the ingest pipeline.
"""
import io
import os
import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
from pypdf import PdfWriter
from app.routers.ingest import router
class _DeterministicEmbedding:
def name(self) -> str:
return "test_deterministic"
def __call__(self, input):
return self._embed(input)
def embed_query(self, input):
return self._embed(input)
@staticmethod
def _embed(texts):
vectors = []
for text in texts:
vec = [0.0] * 384
for i, ch in enumerate(text[:384]):
vec[i] = ord(ch) / 1000.0
vectors.append(vec)
return vectors
def _create_real_pdf(content: str) -> bytes:
    """Return the bytes of a minimal, valid single-page PDF.

    The page is blank. ``content`` is accepted so existing call sites keep
    working, but it is NOT written into the document, so no text can be
    extracted from the result. Use ``_create_text_pdf`` when extractable
    text is required.

    Args:
        content: Ignored.

    Returns:
        The serialized PDF containing one blank 200x200 page.
    """
    writer = PdfWriter()
    writer.add_blank_page(width=200, height=200)
    buf = io.BytesIO()
    writer.write(buf)
    return buf.getvalue()
def _create_text_pdf(lines: list[str]) -> bytes:
    """Create a PDF with actual extractable text using reportlab if available.

    Each entry of ``lines`` is drawn as one line of text, starting at
    y=750 and stepping down 20 units per line. When the next baseline would
    fall below the bottom edge of the page, a new page is started so long
    inputs are not drawn off-page.

    Args:
        lines: Text lines to draw, in order.

    Returns:
        The serialized PDF. If reportlab is not installed, a blank PDF from
        ``_create_real_pdf`` is returned instead (no extractable text).
    """
    # Guard only the import: an ImportError raised while drawing must not be
    # mistaken for "reportlab is missing".
    try:
        from reportlab.pdfgen import canvas as rl_canvas
    except ImportError:
        # Fallback: pypdf blank PDF (no extractable text)
        return _create_real_pdf("")

    top_y = 750
    line_step = 20

    buf = io.BytesIO()
    c = rl_canvas.Canvas(buf)
    y = top_y
    for line in lines:
        if y < 0:
            # Baseline has run off the bottom of the page; continue on a new one.
            c.showPage()
            y = top_y
        c.drawString(72, y, line)
        y -= line_step
    c.save()
    return buf.getvalue()
def _create_real_docx(paragraphs: list[str]) -> bytes:
try:
from docx import Document
doc = Document()
for para in paragraphs:
doc.add_paragraph(para)
buf = io.BytesIO()
doc.save(buf)
return buf.getvalue()
except ImportError:
return b""
@pytest.fixture
def client(tmp_path, monkeypatch):
    """Yield a TestClient for the ingest router backed by per-test storage.

    Setup:
      * Points CHROMA_DB_PATH, DOCUMENT_CHUNK_PATH, PROMPTS_DB_PATH and
        HISTORY_DB_PATH at locations under ``tmp_path`` and sets placeholder
        EMBEDDING_MODEL / LLM_API_KEY values.
      * Clears the ``get_settings`` and ``get_settings_cached`` caches
        (presumably so the settings are rebuilt from the patched environment
        — confirm against app.core.config).
      * Runs ``init_prompts_db`` + ``seed_default_profiles`` on the prompts
        database and ``init_history_db`` on the history database.
      * Replaces ``app.core.database.get_embedding_function_settings`` with a
        factory returning ``_DeterministicEmbedding``.
      * Mounts the ingest router under ``/api/v1`` on a fresh FastAPI app.

    Teardown clears both settings caches again.
    """
    from contextlib import closing

    chroma_path = str(tmp_path / "chroma_db")
    chunk_path = str(tmp_path / "document_chunk")
    prompts_path = str(tmp_path / "prompts.db")
    history_path = str(tmp_path / "history.db")

    monkeypatch.setenv("CHROMA_DB_PATH", chroma_path)
    monkeypatch.setenv("DOCUMENT_CHUNK_PATH", chunk_path)
    monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path)
    monkeypatch.setenv("HISTORY_DB_PATH", history_path)
    monkeypatch.setenv("EMBEDDING_MODEL", "test-mock")
    monkeypatch.setenv("LLM_API_KEY", "test-key")

    # Imported after the environment is patched, then caches are dropped.
    from app.core.config import get_settings
    get_settings.cache_clear()
    from app.core.dependencies import get_settings_cached
    get_settings_cached.cache_clear()

    from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles

    # closing() guarantees each connection is released even if init/seed raises.
    with closing(_get_db(prompts_path)) as conn:
        init_prompts_db(conn)
        seed_default_profiles(conn)
    with closing(_get_db(history_path)) as hconn:
        init_history_db(hconn)

    monkeypatch.setattr(
        "app.core.database.get_embedding_function_settings",
        lambda settings: _DeterministicEmbedding(),
    )

    test_app = FastAPI()
    test_app.include_router(router, prefix="/api/v1")
    yield TestClient(test_app)

    get_settings_cached.cache_clear()
    get_settings.cache_clear()
class TestIngest:
    """Tests for POST /api/v1/ingest using the ``client`` fixture."""

    def test_ingest_txt_success(self, client, tmp_path):
        """Should ingest TXT and return document ID with metadata. Verify real ChromaDB."""
        import chromadb
        from app.core.config import get_settings

        settings = get_settings()
        body = (
            b"This is a test document about testing.\n"
            b"It has multiple lines of content."
        )
        response = client.post(
            "/api/v1/ingest",
            files={"file": ("notes.txt", io.BytesIO(body), "text/plain")},
        )
        assert response.status_code == 200
        data = response.json()
        assert "document_id" in data
        assert data["chunk_count"] >= 1
        assert data["filename"] == "notes.txt"

        # Verify data persisted in real ChromaDB
        db_client = chromadb.PersistentClient(path=settings.chroma_db_path)
        collection = db_client.get_collection("documents")
        all_data = collection.get(include=["metadatas"])
        assert len(all_data["ids"]) >= 1
        filenames = [m["filename"] for m in all_data["metadatas"]]
        assert "notes.txt" in filenames

    def test_ingest_docx_success(self, client, tmp_path):
        """Should ingest DOCX and return document ID with metadata."""
        docx_bytes = _create_real_docx(["Paragraph one content.", "Paragraph two content."])
        if not docx_bytes:
            pytest.skip("python-docx not installed")
        response = client.post(
            "/api/v1/ingest",
            files={
                "file": (
                    "test.docx",
                    io.BytesIO(docx_bytes),
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                )
            },
        )
        assert response.status_code == 200
        data = response.json()
        # Same contract as the TXT and PDF cases: a document ID must be returned.
        assert "document_id" in data
        assert data["chunk_count"] >= 1
        assert data["filename"] == "test.docx"

    def test_ingest_pdf_success(self, client, tmp_path):
        """Should ingest PDF and return document ID with metadata."""
        pdf_bytes = _create_text_pdf(["Page 1 line one", "Page 1 line two"])
        response = client.post(
            "/api/v1/ingest",
            files={"file": ("test.pdf", io.BytesIO(pdf_bytes), "application/pdf")},
        )
        assert response.status_code == 200
        data = response.json()
        # chunk_count is deliberately not asserted: without reportlab the
        # helper falls back to a blank PDF that has no extractable text.
        assert "document_id" in data
        assert data["filename"] == "test.pdf"

    def test_ingest_unsupported_format(self, client):
        """Should reject unsupported file formats."""
        response = client.post(
            "/api/v1/ingest",
            files={"file": ("test.jpg", io.BytesIO(b"image data"), "image/jpeg")},
        )
        assert response.status_code == 400
        assert "unsupported" in response.json()["detail"].lower()

    def test_ingest_no_file(self, client):
        """Should reject request without file."""
        response = client.post("/api/v1/ingest")
        assert response.status_code == 422