# legco_ai_assistant/backend/app/test/test_phase9_evaluate.py
#
# 136 lines
# 5.3 KiB
# Python

"""Phase 9 tests: Evaluation API endpoint integration (Sub-Phase 9.3)."""
import json
from unittest.mock import AsyncMock, patch
import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
from app.models.testing import (
ChunkAccuracy,
DimensionScores,
EvaluatorConfig,
EvaluationResult,
FilteredResult,
GenerateResult,
InputInfo,
KeyQuestionsEvalEntry,
KeyQuestionsEvalResult,
ResponseResult,
RetrievalResult,
TimingInfo,
)
@pytest.fixture(autouse=True)
def _set_api_keys(monkeypatch):
    """Set placeholder API-key environment variables for every test.

    The fixture is ``autouse``, so each test in this module runs with
    ``LLM_API_KEY``, ``DP_API_KEY`` and ``DASHSCOPE_API_KEY`` defined.
    ``monkeypatch`` restores the previous environment after the test.
    """
    placeholder_keys = {
        "LLM_API_KEY": "test-key",
        "DP_API_KEY": "test-dp-key",
        "DASHSCOPE_API_KEY": "test-dashscope-key",
    }
    for env_name, env_value in placeholder_keys.items():
        monkeypatch.setenv(env_name, env_value)
@pytest.fixture
def client(tmp_path, monkeypatch):
    """Yield a ``TestClient`` for the evaluate router using per-test storage.

    Storage directories and both SQLite database paths are placed under
    ``tmp_path`` and exposed through environment variables. The prompts
    database is initialised and seeded with the default profiles, the
    history database is initialised, and the router is mounted under
    ``/api/v1`` on a fresh ``FastAPI`` app. The settings cache is cleared
    before the app is built and again after the test finishes.
    """
    prompts_db = str(tmp_path / "prompts.db")
    history_db = str(tmp_path / "history.db")

    env_overrides = {
        "TEST_RESULTS_DIR": str(tmp_path / "test_results"),
        "TEST_EVALUATIONS_DIR": str(tmp_path / "test_evaluations"),
        "PROMPTS_DB_PATH": prompts_db,
        "HISTORY_DB_PATH": history_db,
        "LLM_API_KEY": "test-key",
        "LLM_BASE_URL": "https://test.example.com/v1",
        "LLM_MODEL_NAME": "test-model",
        "EMBEDDING_MODEL": "test-embedding",
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    # Project imports are done only after the environment has been patched.
    from app.core.config import get_settings

    # Drop any cached settings (presumably so the patched environment is
    # picked up on the next call -- confirm against app.core.config).
    get_settings.cache_clear()

    from app.core.sqlite_db import (
        _get_db,
        init_prompts_db,
        init_history_db,
        seed_default_profiles,
    )

    # Prompts database: create schema, then seed default profiles.
    prompts_conn = _get_db(prompts_db)
    init_prompts_db(prompts_conn)
    seed_default_profiles(prompts_conn)
    prompts_conn.close()

    # History database: schema only.
    history_conn = _get_db(history_db)
    init_history_db(history_conn)
    history_conn.close()

    from app.routers.test_evaluate import router

    app_under_test = FastAPI()
    app_under_test.include_router(router, prefix="/api/v1")

    yield TestClient(app_under_test)

    # Teardown: clear the settings cache again once the test is done.
    get_settings.cache_clear()
def _make_sample_result():
    """Build a minimal ``GenerateResult`` populated with fixed sample values.

    Returns:
        A ``GenerateResult`` with id ``"test-result-001"``, two extracted key
        questions, empty per-sub-question lists, and every stage time set to
        100 ms (400 ms total).
    """
    retrieval = RetrievalResult(
        per_sub_question=[],
        total_chunks_retrieved=10,
        retriever_time_ms=100,
    )
    filtered = FilteredResult(
        per_sub_question=[],
        total_chunks_filtered=5,
        filter_time_ms=100,
    )
    response = ResponseResult(
        final_answer="answer",
        sub_question_sources=[],
        generate_time_ms=100,
    )
    timing = TimingInfo(
        decomposer_time_ms=100,
        retriever_time_ms=100,
        filter_time_ms=100,
        generator_time_ms=100,
        total_time_ms=400,
    )
    return GenerateResult(
        result_id="test-result-001",
        input_type="text",
        profile="A",
        input=InputInfo(text="test question"),
        extracted_key_questions=["q1", "q2"],
        retrieval=retrieval,
        filtered=filtered,
        response=response,
        timing=timing,
    )
@pytest.fixture
def saved_result(client):
    """Store a sample result through ``TestStorageService`` and return its id.

    Depends on ``client`` so the environment and settings are configured
    before the storage service is constructed.

    Returns:
        The ``result_id`` of the saved sample result.
    """
    from app.services.test_storage_service import TestStorageService
    from app.core.config import get_settings

    settings = get_settings()
    storage = TestStorageService(
        settings.test_results_dir,
        settings.test_evaluations_dir,
    )
    sample = _make_sample_result()
    storage.save_result(sample)
    return sample.result_id
class TestEvaluateEndpoint:
    """Integration tests for the ``POST /api/v1/test/evaluate`` endpoint."""

    # Single source of truth for the endpoint path used by every test here.
    EVALUATE_URL = "/api/v1/test/evaluate"

    def test_valid_evaluate_returns_200(self, client, saved_result):
        """A stored result with a full evaluator config is accepted (HTTP 200).

        The test is synchronous: ``TestClient`` is a synchronous client and
        nothing in the test is awaited, so no asyncio marker is needed.

        Args:
            client: ``TestClient`` bound to the evaluate router.
            saved_result: id of a result already persisted to storage.
        """
        # NOTE(review): no evaluator/LLM call is stubbed in this test, which
        # is presumably why "partial" is accepted alongside "completed". If
        # the endpoint contacts the configured base URLs, patch the
        # evaluation service here -- confirm against the router.
        payload = {
            "result_id": saved_result,
            "evaluation_config": {
                "key_questions_evaluators": [
                    {
                        "model_name": "deepseek-v4-pro",
                        "base_url": "https://api.deepseek.com",
                        "api_key_env": "DP_API_KEY",
                        "enable_thinking": True,
                    },
                    {
                        "model_name": "qwen3-7b-max",
                        "base_url": "https://dashscope.example.com/v1",
                        "api_key_env": "DASHSCOPE_API_KEY",
                        "enable_thinking": True,
                    },
                ],
                "chunk_evaluator": {
                    "model_name": "test",
                    "base_url": "https://test.example.com",
                    "api_key_env": "LLM_API_KEY",
                    "enable_thinking": True,
                },
                "response_evaluator": {
                    "model_name": "test",
                    "base_url": "https://test.example.com",
                    "api_key_env": "LLM_API_KEY",
                    "enable_thinking": True,
                },
            },
        }

        resp = client.post(self.EVALUATE_URL, json=payload)

        assert resp.status_code == 200
        data = resp.json()
        assert data["status"] in ("completed", "partial")
        assert "evaluation_id" in data

    def test_missing_result_returns_404(self, client):
        """An unknown ``result_id`` is rejected with HTTP 404.

        Args:
            client: ``TestClient`` bound to the evaluate router.
        """
        payload = {
            "result_id": "no-such-id",
            "evaluation_config": {
                "key_questions_evaluators": [],
                "chunk_evaluator": {
                    "model_name": "t",
                    "base_url": "https://x.com",
                    "api_key_env": "LLM_API_KEY",
                },
                "response_evaluator": {
                    "model_name": "t",
                    "base_url": "https://x.com",
                    "api_key_env": "LLM_API_KEY",
                },
            },
        }

        resp = client.post(self.EVALUATE_URL, json=payload)

        assert resp.status_code == 404