"""Phase 8 tests: Ingest API integration with strategy selection (Sub-Phase 8.3). Covers: - POST /api/v1/api/v1/ingest?strategy=token — existing behavior unchanged - POST /api/v1/api/v1/ingest?strategy=question — Q&A chunking applied - Invalid strategy values return 400 - IngestResponse includes strategy field - DOCX with Q&A format uses question strategy - Document without Q&A falls back gracefully """ import io import json from typing import List, Tuple from unittest.mock import MagicMock import pytest from fastapi import FastAPI from fastapi.testclient import TestClient from pypdf import PdfWriter from app.routers.ingest import router class _DeterministicEmbedding: def name(self) -> str: return "test_deterministic" def __call__(self, input): return self._embed(input) def embed_query(self, input): return self._embed(input) @staticmethod def _embed(texts): vectors = [] for text in texts: vec = [0.0] * 384 for i, ch in enumerate(text[:384]): vec[i] = ord(ch) / 1000.0 vectors.append(vec) return vectors def _create_real_pdf(content: str) -> bytes: writer = PdfWriter() writer.add_blank_page(width=200, height=200) buf = io.BytesIO() writer.write(buf) return buf.getvalue() def _create_text_txt(content: str) -> bytes: return content.encode("utf-8") @pytest.fixture def client(tmp_path, monkeypatch): """TestClient with real ChromaDB isolated in tmp_path + deterministic embeddings.""" chroma_path = str(tmp_path / "chroma_db") chunk_path = str(tmp_path / "document_chunk") prompts_path = str(tmp_path / "prompts.db") history_path = str(tmp_path / "history.db") monkeypatch.setenv("CHROMA_DB_PATH", chroma_path) monkeypatch.setenv("DOCUMENT_CHUNK_PATH", chunk_path) monkeypatch.setenv("PROMPTS_DB_PATH", prompts_path) monkeypatch.setenv("HISTORY_DB_PATH", history_path) monkeypatch.setenv("EMBEDDING_MODEL", "test-mock") monkeypatch.setenv("LLM_API_KEY", "test-key") from app.core.config import get_settings get_settings.cache_clear() from app.core.dependencies import get_settings_cached get_settings_cached.cache_clear() from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles conn = _get_db(prompts_path) init_prompts_db(conn) seed_default_profiles(conn) conn.close() hconn = _get_db(history_path) init_history_db(hconn) hconn.close() monkeypatch.setattr( "app.core.database.get_embedding_function_settings", lambda settings: _DeterministicEmbedding(), ) test_app = FastAPI() test_app.include_router(router, prefix="/api/v1") yield TestClient(test_app) get_settings_cached.cache_clear() get_settings.cache_clear() def test_ingest_with_strategy_token(client): """Existing behavior unchanged: strategy=token uses TokenChunkingStrategy.""" txt_bytes = _create_text_txt("This is a test document with enough content to generate chunks.") resp = client.post( "/api/v1/ingest?strategy=token", files={"file": ("test.txt", txt_bytes, "text/plain")}, ) assert resp.status_code == 200 data = resp.json() assert data["strategy"] == "token" assert data["chunk_count"] > 0 def test_ingest_invalid_strategy_rejected(client): """Invalid strategy values return 400.""" txt_bytes = _create_text_txt("test") resp = client.post( "/api/v1/ingest?strategy=invalid", files={"file": ("test.txt", txt_bytes, "text/plain")}, ) assert resp.status_code == 400 assert "strategy" in resp.json()["detail"].lower() def test_ingest_response_includes_strategy(client): """IngestResponse includes the strategy field.""" txt_bytes = _create_text_txt("Strategy response test content with more text to ensure chunks.") resp = client.post( "/api/v1/ingest?strategy=token", files={"file": ("test.txt", txt_bytes, "text/plain")}, ) assert resp.status_code == 200 assert "strategy" in resp.json() def test_ingest_default_strategy_is_token(client): """When no strategy param provided, default to token.""" txt_bytes = _create_text_txt("Default strategy test with enough text to generate output.") resp = client.post( "/api/v1/ingest", files={"file": ("test.txt", txt_bytes, "text/plain")}, ) assert resp.status_code == 200 assert resp.json()["strategy"] == "token" def test_ingest_question_strategy_txt(client, monkeypatch): """TXT with Q&A format uses question strategy and produces chunks.""" _mock_question_chunker(monkeypatch) txt_bytes = _create_text_txt("問A1:test question\n答A1:test answer with more text here to ensure chunking works properly.") resp = client.post( "/api/v1/ingest?strategy=question", files={"file": ("test.txt", txt_bytes, "text/plain")}, ) assert resp.status_code == 200 data = resp.json() assert data["strategy"] == "question" assert data["chunk_count"] > 0 def test_ingest_question_strategy_no_qa_fallback(client, monkeypatch): """Document without Q&A markers falls back to narrative chunking without error.""" _mock_question_chunker(monkeypatch) txt_bytes = _create_text_txt("This is plain text without any Q&A markers, but it needs to be long enough to generate at least one chunk when processed by the tokenizer.") resp = client.post( "/api/v1/ingest?strategy=question", files={"file": ("plain.txt", txt_bytes, "text/plain")}, ) assert resp.status_code == 200 data = resp.json() assert data["strategy"] == "question" assert data["chunk_count"] > 0 def _mock_question_chunker(monkeypatch): """Replace QuestionChunkingStrategy with a mock that returns test chunks.""" class _MockQuestionChunker: def __init__(self, settings=None, llm_client=None): self._chunk_metadata = [ { "strategy_type": "question", "section_type": "qa", "question_index": 0, "question_id": "A1", "question_text": "What is X?", "section_heading": "(A) Topic", "answer_contains_table": False, "source_page_range": [1, 2], } ] self._max_tokens = 3000 def chunk(self, text): self._chunk_metadata = self._chunk_metadata[:1] return ["Question: What is X?\n\nAnswer: X is Y."] async def chunk_pages(self, pages, overlap_tokens=0): self._chunk_metadata = self._chunk_metadata[:1] return [("Question: What is X?\n\nAnswer: X is Y.", 1)] monkeypatch.setattr( "app.utils.chunking.QuestionChunkingStrategy", _MockQuestionChunker, )