210 lines
6.8 KiB
Python
210 lines
6.8 KiB
Python
"""Phase 8 tests: Ingest API integration with strategy selection (Sub-Phase 8.3).

Covers:
- POST /api/v1/ingest?strategy=token — existing behavior unchanged
- POST /api/v1/ingest?strategy=question — Q&A chunking applied
- Invalid strategy values return 400
- IngestResponse includes strategy field
- DOCX with Q&A format uses question strategy
- Document without Q&A falls back gracefully
"""
|
||
import io
import json
from contextlib import closing
from typing import List, Tuple
from unittest.mock import MagicMock

import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
from pypdf import PdfWriter

from app.routers.ingest import router
|
||
|
||
|
||
class _DeterministicEmbedding:
    """Embedding stand-in that derives a fixed-size vector from the text itself.

    Each text becomes a 384-element list of floats: position ``i`` holds
    ``ord(text[i]) / 1000.0`` for the first 384 characters, and every
    remaining position is ``0.0``. The same text therefore always yields the
    same vector, with no model involved.
    """

    def name(self) -> str:
        """Return the identifier reported for this embedding function."""
        return "test_deterministic"

    def __call__(self, input):
        """Embed every text in ``input`` and return one vector per text."""
        return self._embed(input)

    def embed_query(self, input):
        """Embed query texts; identical to calling the instance directly."""
        return self._embed(input)

    @staticmethod
    def _embed(texts):
        """Return one 384-dimensional vector per item of ``texts``.

        Characters beyond the 384th are ignored; shorter texts are padded
        with ``0.0``.
        """
        dims = 384
        vectors = []
        for text in texts:
            head = [ord(ch) / 1000.0 for ch in text[:dims]]
            vectors.append(head + [0.0] * (dims - len(head)))
        return vectors
|
||
|
||
|
||
def _create_real_pdf(content: str) -> bytes:
    """Return the serialized bytes of a PDF holding one blank 200x200 page.

    NOTE: ``content`` is accepted but never written into the document; the
    resulting PDF contains no text.
    """
    writer = PdfWriter()
    writer.add_blank_page(width=200, height=200)
    with io.BytesIO() as buffer:
        writer.write(buffer)
        return buffer.getvalue()
|
||
|
||
|
||
def _create_text_txt(content: str) -> bytes:
    """Return ``content`` encoded as UTF-8 bytes, ready to upload as a .txt file."""
    return content.encode("utf-8")
|
||
|
||
|
||
@pytest.fixture
def client(tmp_path, monkeypatch):
    """TestClient with real ChromaDB isolated in tmp_path + deterministic embeddings.

    Points every storage path at ``tmp_path`` through environment variables,
    initializes the prompts and history databases, replaces the embedding
    function factory with ``_DeterministicEmbedding``, and mounts the ingest
    router under ``/api/v1`` on a fresh FastAPI app. The settings caches are
    cleared before the test and again after it.
    """
    prompts_path = str(tmp_path / "prompts.db")
    history_path = str(tmp_path / "history.db")
    environment = {
        "CHROMA_DB_PATH": str(tmp_path / "chroma_db"),
        "DOCUMENT_CHUNK_PATH": str(tmp_path / "document_chunk"),
        "PROMPTS_DB_PATH": prompts_path,
        "HISTORY_DB_PATH": history_path,
        "EMBEDDING_MODEL": "test-mock",
        "LLM_API_KEY": "test-key",
    }
    for variable, value in environment.items():
        monkeypatch.setenv(variable, value)

    # Clear the cached settings after patching the environment — presumably so
    # the next lookup rebuilds them from the values set above.
    from app.core.config import get_settings
    get_settings.cache_clear()
    from app.core.dependencies import get_settings_cached
    get_settings_cached.cache_clear()

    from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles

    # closing() guarantees each connection is released even if an init step raises.
    with closing(_get_db(prompts_path)) as conn:
        init_prompts_db(conn)
        seed_default_profiles(conn)

    with closing(_get_db(history_path)) as hconn:
        init_history_db(hconn)

    monkeypatch.setattr(
        "app.core.database.get_embedding_function_settings",
        lambda settings: _DeterministicEmbedding(),
    )

    test_app = FastAPI()
    test_app.include_router(router, prefix="/api/v1")

    yield TestClient(test_app)

    # Teardown: drop the settings cached during this test.
    get_settings_cached.cache_clear()
    get_settings.cache_clear()
|
||
|
||
|
||
def test_ingest_with_strategy_token(client):
    """Existing behavior unchanged: strategy=token uses TokenChunkingStrategy."""
    txt_bytes = _create_text_txt(
        "This is a test document with enough content to generate chunks."
    )

    resp = client.post(
        "/api/v1/ingest?strategy=token",
        files={"file": ("test.txt", txt_bytes, "text/plain")},
    )

    assert resp.status_code == 200
    data = resp.json()
    assert data["strategy"] == "token"
    assert data["chunk_count"] > 0
|
||
|
||
|
||
def test_ingest_invalid_strategy_rejected(client):
    """Invalid strategy values return 400."""
    txt_bytes = _create_text_txt("test")

    resp = client.post(
        "/api/v1/ingest?strategy=invalid",
        files={"file": ("test.txt", txt_bytes, "text/plain")},
    )

    assert resp.status_code == 400
    # The error detail must mention the offending parameter.
    assert "strategy" in resp.json()["detail"].lower()
|
||
|
||
|
||
def test_ingest_response_includes_strategy(client):
    """IngestResponse includes the strategy field."""
    txt_bytes = _create_text_txt(
        "Strategy response test content with more text to ensure chunks."
    )

    resp = client.post(
        "/api/v1/ingest?strategy=token",
        files={"file": ("test.txt", txt_bytes, "text/plain")},
    )

    assert resp.status_code == 200
    assert "strategy" in resp.json()
|
||
|
||
|
||
def test_ingest_default_strategy_is_token(client):
    """When no strategy param provided, default to token."""
    txt_bytes = _create_text_txt(
        "Default strategy test with enough text to generate output."
    )

    resp = client.post(
        "/api/v1/ingest",
        files={"file": ("test.txt", txt_bytes, "text/plain")},
    )

    assert resp.status_code == 200
    assert resp.json()["strategy"] == "token"
|
||
|
||
|
||
def test_ingest_question_strategy_txt(client, monkeypatch):
    """TXT with Q&A format uses question strategy and produces chunks."""
    _mock_question_chunker(monkeypatch)

    txt_bytes = _create_text_txt("問A1:test question\n答A1:test answer with more text here to ensure chunking works properly.")

    resp = client.post(
        "/api/v1/ingest?strategy=question",
        files={"file": ("test.txt", txt_bytes, "text/plain")},
    )

    assert resp.status_code == 200
    data = resp.json()
    assert data["strategy"] == "question"
    assert data["chunk_count"] > 0
|
||
|
||
|
||
def test_ingest_question_strategy_no_qa_fallback(client, monkeypatch):
    """Document without Q&A markers falls back to narrative chunking without error."""
    _mock_question_chunker(monkeypatch)

    txt_bytes = _create_text_txt("This is plain text without any Q&A markers, but it needs to be long enough to generate at least one chunk when processed by the tokenizer.")

    resp = client.post(
        "/api/v1/ingest?strategy=question",
        files={"file": ("plain.txt", txt_bytes, "text/plain")},
    )

    assert resp.status_code == 200
    data = resp.json()
    assert data["strategy"] == "question"
    assert data["chunk_count"] > 0
|
||
|
||
|
||
def _mock_question_chunker(monkeypatch):
    """Replace QuestionChunkingStrategy with a mock that returns test chunks.

    Patches ``app.utils.chunking.QuestionChunkingStrategy`` with a stand-in
    class whose ``chunk`` and ``chunk_pages`` methods ignore their input and
    always return one fixed Q&A chunk, with a single matching metadata entry
    in ``_chunk_metadata``.
    """
    chunk_text = "Question: What is X?\n\nAnswer: X is Y."

    class _MockQuestionChunker:
        """Stand-in chunker producing one canned Q&A chunk."""

        def __init__(self, settings=None, llm_client=None):
            # Both constructor arguments are accepted and ignored.
            self._chunk_metadata = [
                {
                    "strategy_type": "question",
                    "section_type": "qa",
                    "question_index": 0,
                    "question_id": "A1",
                    "question_text": "What is X?",
                    "section_heading": "(A) Topic",
                    "answer_contains_table": False,
                    "source_page_range": [1, 2],
                }
            ]
            self._max_tokens = 3000

        def chunk(self, text):
            """Return the canned chunk as a one-element list of strings."""
            # Keep metadata at one entry so it lines up with the single chunk.
            self._chunk_metadata = self._chunk_metadata[:1]
            return [chunk_text]

        def chunk_pages(self, pages, overlap_tokens=0):
            """Return the canned chunk as a single ``(text, 1)`` pair."""
            self._chunk_metadata = self._chunk_metadata[:1]
            return [(chunk_text, 1)]

    monkeypatch.setattr(
        "app.utils.chunking.QuestionChunkingStrategy",
        _MockQuestionChunker,
    )
|