# legco_ai_assistant/backend/app/test/test_phase1_ingest.py

"""Phase 1 tests: Document ingestion endpoint.

Covers:
- POST /api/v1/ingest with valid documents
- Metadata extraction (filename, upload_date, content_summary)
- ChromaDB persistence with embeddings
- Error handling for unsupported file types
"""
import pytest
from fastapi.testclient import TestClient
from unittest.mock import MagicMock, patch


class TestIngest:
    """Exercise POST /api/v1/ingest with supported, unsupported and missing uploads."""

    @pytest.fixture
    def client(self):
        """Return a ``TestClient`` wrapping the ``app`` object from ``app.main``.

        The application is imported inside the fixture, so it is only loaded
        when a test actually requests the client.
        """
        from app.main import app as application

        return TestClient(application)

    def test_ingest_pdf_success(self, client, tmp_path):
        """A PDF upload answers 200 with a document ID, chunk count and filename.

        The RAG service, per-page PDF parser, chunking strategy, metadata
        extractor and page extractor are all patched for the request.
        """
        from io import BytesIO

        # Stand-in for the RAGService instance the patched class hands back.
        rag_service = MagicMock()
        rag_service.ingest_document.return_value = "doc-123"
        rag_service.list_documents.return_value = ([], 0, 0)

        # Stand-in chunker producing two (text, page) pairs.
        page_chunker = MagicMock()
        page_chunker.chunk_pages.return_value = [("chunk 1", 1), ("chunk 2", 2)]

        rag_patcher = patch("app.services.rag.RAGService", return_value=rag_service)
        parser_patcher = patch(
            "app.utils.pdf_parser.parse_pdf_by_page",
            return_value=[(1, "Page 1 text"), (2, "Page 2 text")],
        )
        chunker_patcher = patch(
            "app.utils.chunking.TokenChunkingStrategy",
            return_value=page_chunker,
        )
        metadata_patcher = patch(
            "app.utils.metadata.extract_metadata",
            return_value=[
                {"filename": "test.pdf", "chunk_index": 0},
                {"filename": "test.pdf", "chunk_index": 1},
            ],
        )
        page_extract_patcher = patch("app.utils.pdf_extractor.extract_page_as_pdf")

        upload = {"file": ("test.pdf", BytesIO(b"%PDF-1.4"), "application/pdf")}

        # Patchers are entered left to right and undone in reverse on exit.
        with rag_patcher, parser_patcher, chunker_patcher, metadata_patcher, page_extract_patcher:
            response = client.post("/api/v1/ingest", files=upload)

        assert response.status_code == 200
        payload = response.json()
        assert "document_id" in payload
        assert payload["chunk_count"] == 2
        assert payload["filename"] == "test.pdf"

    def test_ingest_docx_success(self, client, tmp_path):
        """A DOCX upload answers 200 and reports one chunk plus the filename.

        The RAG service, DOCX parser, chunking strategy and metadata
        extractor are all patched for the request.
        """
        from io import BytesIO

        # Stand-in for the RAGService instance the patched class hands back.
        rag_service = MagicMock()
        rag_service.ingest_document.return_value = "doc-456"
        rag_service.list_documents.return_value = ([], 0, 0)

        # Stand-in chunker producing a single text chunk.
        text_chunker = MagicMock()
        text_chunker.chunk.return_value = ["chunk 1"]

        rag_patcher = patch("app.services.rag.RAGService", return_value=rag_service)
        parser_patcher = patch(
            "app.utils.docx_parser.parse_docx",
            return_value="Parsed DOCX text content",
        )
        chunker_patcher = patch(
            "app.utils.chunking.TokenChunkingStrategy",
            return_value=text_chunker,
        )
        metadata_patcher = patch(
            "app.utils.metadata.extract_metadata",
            return_value=[{"filename": "test.docx", "chunk_index": 0}],
        )

        docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        upload = {"file": ("test.docx", BytesIO(b"docx content"), docx_mime)}

        # Patchers are entered left to right and undone in reverse on exit.
        with rag_patcher, parser_patcher, chunker_patcher, metadata_patcher:
            response = client.post("/api/v1/ingest", files=upload)

        assert response.status_code == 200
        payload = response.json()
        assert payload["chunk_count"] == 1
        assert payload["filename"] == "test.docx"

    def test_ingest_unsupported_format(self, client):
        """A JPEG upload is refused with 400 and a detail mentioning "unsupported"."""
        from io import BytesIO

        upload = {"file": ("test.jpg", BytesIO(b"image data"), "image/jpeg")}
        response = client.post("/api/v1/ingest", files=upload)

        assert response.status_code == 400
        detail = response.json()["detail"]
        assert "unsupported" in detail.lower()

    def test_ingest_no_file(self, client):
        """Posting with no file part is rejected with 422."""
        response = client.post("/api/v1/ingest")

        assert response.status_code == 422