# legco_ai_assistant/backend/app/test/test_phase1_ingest.py

"""Phase 1 tests: Document ingestion endpoint.

Covers:
- POST /api/v1/ingest with valid documents (PDF, DOCX, TXT)
- Metadata extraction (filename, upload_date, content_summary)
- ChromaDB persistence (verify by querying real collection)
- Error handling for unsupported file types
- Error handling for missing file field

Uses TestClient + real ChromaDB + real chunking + real metadata extraction.
Embedding function is mocked with deterministic vectors (external API).
No LLM calls involved in the ingest pipeline.
"""
import io
import os

import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
from pypdf import PdfWriter

from app.routers.ingest import router


class _DeterministicEmbedding:
    def name(self) -> str:
        return "test_deterministic"

    def __call__(self, input):
        return self._embed(input)

    def embed_query(self, input):
        return self._embed(input)

    @staticmethod
    def _embed(texts):
        vectors = []
        for text in texts:
            vec = [0.0] * 384
            for i, ch in enumerate(text[:384]):
                vec[i] = ord(ch) / 1000.0
            vectors.append(vec)
        return vectors


def _create_real_pdf(content: str) -> bytes:
    from pypdf import PdfWriter
    writer = PdfWriter()
    writer.add_blank_page(width=200, height=200)
    page = writer.pages[0]
    # Add text content via page-level operator (simple approach)
    # pypdf blank pages have no text — we write the content as annotation
    # For testing, we just need a valid PDF; actual text extraction tested separately
    buf = io.BytesIO()
    writer.write(buf)
    return buf.getvalue()


def _create_text_pdf(lines: list[str]) -> bytes:
    """Render ``lines`` into a PDF with extractable text via reportlab.

    Each line is drawn with a left offset of 72, starting at y=750 and moving
    down by 20 per line. When reportlab cannot be imported, a blank pypdf
    document (no extractable text) is returned instead.
    """
    try:
        from reportlab.pdfgen import canvas as rl_canvas

        stream = io.BytesIO()
        pdf = rl_canvas.Canvas(stream)
        for row, text in enumerate(lines):
            pdf.drawString(72, 750 - 20 * row, text)
        pdf.save()
    except ImportError:
        # reportlab is optional: degrade to a text-less but valid PDF.
        return _create_real_pdf("")
    return stream.getvalue()

def _create_real_docx(paragraphs: list[str]) -> bytes:
    try:
        from docx import Document
        doc = Document()
        for para in paragraphs:
            doc.add_paragraph(para)
        buf = io.BytesIO()
        doc.save(buf)
        return buf.getvalue()
    except ImportError:
        return b""


@pytest.fixture
def client(tmp_path, monkeypatch):
    """Yield a TestClient for the ingest router backed by per-test storage.

    Setup, in order:
      1. Point every storage-related environment variable at ``tmp_path``.
      2. Clear the cached settings objects (presumably so they are rebuilt
         from the patched environment -- confirm against ``app.core.config``).
      3. Initialise the prompts and history SQLite databases.
      4. Replace the embedding-function factory with a deterministic stub.
      5. Mount the ingest router on a fresh FastAPI app under ``/api/v1``.

    Teardown clears the settings caches again so later tests do not observe
    values cached during this test.
    """
    chroma_path = str(tmp_path / "chroma_db")
    chunk_path = str(tmp_path / "document_chunk")
    prompts_path = str(tmp_path / "prompts.db")
    history_path = str(tmp_path / "history.db")

    env_overrides = {
        "CHROMA_DB_PATH": chroma_path,
        "DOCUMENT_CHUNK_PATH": chunk_path,
        "PROMPTS_DB_PATH": prompts_path,
        "HISTORY_DB_PATH": history_path,
        "EMBEDDING_MODEL": "test-mock",
        "LLM_API_KEY": "test-key",
    }
    for key, value in env_overrides.items():
        monkeypatch.setenv(key, value)

    # Imported only after the environment is patched, matching the original
    # ordering in case these modules read the environment at import time.
    from app.core.config import get_settings
    get_settings.cache_clear()
    from app.core.dependencies import get_settings_cached
    get_settings_cached.cache_clear()

    from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles

    # try/finally guarantees the connection is released even if schema
    # creation or seeding raises, so a failing setup cannot leak a handle.
    conn = _get_db(prompts_path)
    try:
        init_prompts_db(conn)
        seed_default_profiles(conn)
    finally:
        conn.close()

    hconn = _get_db(history_path)
    try:
        init_history_db(hconn)
    finally:
        hconn.close()

    # The real embedding function calls an external API; use the stub instead.
    monkeypatch.setattr(
        "app.core.database.get_embedding_function_settings",
        lambda settings: _DeterministicEmbedding(),
    )

    test_app = FastAPI()
    test_app.include_router(router, prefix="/api/v1")

    yield TestClient(test_app)

    get_settings_cached.cache_clear()
    get_settings.cache_clear()


class TestIngest:
    """End-to-end checks for ``POST /api/v1/ingest``."""

    def test_ingest_txt_success(self, client, tmp_path):
        """A TXT upload succeeds and its chunks are stored in the real ChromaDB."""
        import chromadb
        from app.core.config import get_settings

        settings = get_settings()
        body = b"This is a test document about testing.\nIt has multiple lines of content."
        upload = ("notes.txt", io.BytesIO(body), "text/plain")

        response = client.post("/api/v1/ingest", files={"file": upload})

        assert response.status_code == 200
        payload = response.json()
        assert "document_id" in payload
        assert payload["chunk_count"] >= 1
        assert payload["filename"] == "notes.txt"

        # Open the on-disk store with a separate client to prove persistence.
        store = chromadb.PersistentClient(path=settings.chroma_db_path)
        stored = store.get_collection("documents").get(include=["metadatas"])
        assert len(stored["ids"]) >= 1
        stored_names = [meta["filename"] for meta in stored["metadatas"]]
        assert "notes.txt" in stored_names

    def test_ingest_docx_success(self, client, tmp_path):
        """A DOCX upload succeeds and reports at least one chunk."""
        docx_bytes = _create_real_docx(["Paragraph one content.", "Paragraph two content."])
        if not docx_bytes:
            # The helper returns b"" when python-docx cannot be imported.
            pytest.skip("python-docx not installed")

        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        upload = ("test.docx", io.BytesIO(docx_bytes), docx_mime)

        response = client.post("/api/v1/ingest", files={"file": upload})

        assert response.status_code == 200
        payload = response.json()
        assert payload["chunk_count"] >= 1
        assert payload["filename"] == "test.docx"

    def test_ingest_pdf_success(self, client, tmp_path):
        """A PDF upload succeeds and echoes the filename with a document ID."""
        pdf_bytes = _create_text_pdf(["Page 1 line one", "Page 1 line two"])
        upload = ("test.pdf", io.BytesIO(pdf_bytes), "application/pdf")

        response = client.post("/api/v1/ingest", files={"file": upload})

        assert response.status_code == 200
        payload = response.json()
        assert "document_id" in payload
        assert payload["filename"] == "test.pdf"

    def test_ingest_unsupported_format(self, client):
        """An unsupported file type is rejected with HTTP 400."""
        upload = ("test.jpg", io.BytesIO(b"image data"), "image/jpeg")

        response = client.post("/api/v1/ingest", files={"file": upload})

        assert response.status_code == 400
        detail = response.json()["detail"]
        assert "unsupported" in detail.lower()

    def test_ingest_no_file(self, client):
        """A request without the file field fails validation with HTTP 422."""
        response = client.post("/api/v1/ingest")

        assert response.status_code == 422