136 lines
5.3 KiB
Python
136 lines
5.3 KiB
Python
"""Phase 9 tests: Evaluation API endpoint integration (Sub-Phase 9.3)."""
|
|
import json
|
|
from unittest.mock import AsyncMock, patch
|
|
|
|
import pytest
|
|
from fastapi import FastAPI
|
|
from fastapi.testclient import TestClient
|
|
|
|
from app.models.testing import (
|
|
ChunkAccuracy,
|
|
DimensionScores,
|
|
EvaluatorConfig,
|
|
EvaluationResult,
|
|
FilteredResult,
|
|
GenerateResult,
|
|
InputInfo,
|
|
KeyQuestionsEvalEntry,
|
|
KeyQuestionsEvalResult,
|
|
ResponseResult,
|
|
RetrievalResult,
|
|
TimingInfo,
|
|
)
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def _set_api_keys(monkeypatch):
    """Set placeholder API keys in the environment for every test.

    The fixture is ``autouse``, so it applies to each test in this module
    without being requested. Values are set through ``monkeypatch``, which
    undoes them when the test finishes.
    """
    placeholder_keys = {
        "LLM_API_KEY": "test-key",
        "DP_API_KEY": "test-dp-key",
        "DASHSCOPE_API_KEY": "test-dashscope-key",
    }
    for env_name, env_value in placeholder_keys.items():
        monkeypatch.setenv(env_name, env_value)
|
|
|
|
|
|
@pytest.fixture
def client(tmp_path, monkeypatch):
    """Yield a ``TestClient`` for the evaluate router, backed by temp storage.

    Setup:
      * points the result/evaluation directories and both SQLite database
        paths at locations under ``tmp_path`` via environment variables,
        and sets placeholder LLM/embedding settings;
      * clears the ``get_settings`` cache so the patched environment is read;
      * initialises the prompts database (including the default profiles)
        and the history database;
      * mounts the evaluate router on a fresh ``FastAPI`` app under
        ``/api/v1``.

    Teardown clears the ``get_settings`` cache again so values from this
    test do not carry over into the next one.
    """
    results_dir = str(tmp_path / "test_results")
    evals_dir = str(tmp_path / "test_evaluations")
    prompts_path = str(tmp_path / "prompts.db")
    history_path = str(tmp_path / "history.db")

    test_env = {
        "TEST_RESULTS_DIR": results_dir,
        "TEST_EVALUATIONS_DIR": evals_dir,
        "PROMPTS_DB_PATH": prompts_path,
        "HISTORY_DB_PATH": history_path,
        "LLM_API_KEY": "test-key",
        "LLM_BASE_URL": "https://test.example.com/v1",
        "LLM_MODEL_NAME": "test-model",
        "EMBEDDING_MODEL": "test-embedding",
    }
    for env_name, env_value in test_env.items():
        monkeypatch.setenv(env_name, env_value)

    # Project imports happen only after the environment has been patched
    # (presumably so nothing reads the real configuration at import time).
    from app.core.config import get_settings

    get_settings.cache_clear()

    from app.core.sqlite_db import _get_db, init_prompts_db, init_history_db, seed_default_profiles

    # Close each connection even if initialisation fails, so a broken setup
    # does not leak a handle onto the temporary database file.
    conn = _get_db(prompts_path)
    try:
        init_prompts_db(conn)
        seed_default_profiles(conn)
    finally:
        conn.close()

    hconn = _get_db(history_path)
    try:
        init_history_db(hconn)
    finally:
        hconn.close()

    from app.routers.test_evaluate import router

    test_app = FastAPI()
    test_app.include_router(router, prefix="/api/v1")
    try:
        yield TestClient(test_app)
    finally:
        get_settings.cache_clear()
|
|
|
|
|
|
def _make_sample_result():
    """Build a small ``GenerateResult`` with fixed values for use in tests.

    The result has id ``"test-result-001"``, two extracted key questions,
    empty per-sub-question lists, and 100 ms for every pipeline stage
    (400 ms in total).
    """
    question = InputInfo(text="test question")
    retrieval_stage = RetrievalResult(
        per_sub_question=[],
        total_chunks_retrieved=10,
        retriever_time_ms=100,
    )
    filter_stage = FilteredResult(
        per_sub_question=[],
        total_chunks_filtered=5,
        filter_time_ms=100,
    )
    response_stage = ResponseResult(
        final_answer="answer",
        sub_question_sources=[],
        generate_time_ms=100,
    )
    stage_timing = TimingInfo(
        decomposer_time_ms=100,
        retriever_time_ms=100,
        filter_time_ms=100,
        generator_time_ms=100,
        total_time_ms=400,
    )
    return GenerateResult(
        result_id="test-result-001",
        input_type="text",
        profile="A",
        input=question,
        extracted_key_questions=["q1", "q2"],
        retrieval=retrieval_stage,
        filtered=filter_stage,
        response=response_stage,
        timing=stage_timing,
    )
|
|
|
|
|
|
@pytest.fixture
def saved_result(client):
    """Store the sample result on disk and return its ``result_id``.

    ``client`` is requested only for its setup: it patches the storage
    directory environment variables and resets the settings cache before
    the settings are read here.
    """
    from app.services.test_storage_service import TestStorageService
    from app.core.config import get_settings

    settings = get_settings()
    storage = TestStorageService(
        settings.test_results_dir,
        settings.test_evaluations_dir,
    )
    sample = _make_sample_result()
    storage.save_result(sample)
    return sample.result_id
|
|
|
|
|
|
class TestEvaluateEndpoint:
    """Integration tests for ``POST /api/v1/test/evaluate``."""

    def test_valid_evaluate_returns_200(self, client, saved_result):
        """Evaluating a stored result returns 200 with a status and an id.

        This is a plain synchronous test: ``TestClient.post`` blocks until
        the response is available, so no event loop or asyncio marker is
        needed.
        """
        # NOTE(review): nothing is patched here, so the router runs its real
        # evaluation path with the placeholder keys set by the fixtures.
        # Presumably that is why "partial" is accepted alongside
        # "completed" -- confirm, and consider patching the evaluator calls
        # so this test cannot depend on network access.
        payload = {
            "result_id": saved_result,
            "evaluation_config": {
                "key_questions_evaluators": [
                    {
                        "model_name": "deepseek-v4-pro",
                        "base_url": "https://api.deepseek.com",
                        "api_key_env": "DP_API_KEY",
                        "enable_thinking": True,
                    },
                    {
                        "model_name": "qwen3-7b-max",
                        "base_url": "https://dashscope.example.com/v1",
                        "api_key_env": "DASHSCOPE_API_KEY",
                        "enable_thinking": True,
                    },
                ],
                "chunk_evaluator": {
                    "model_name": "test",
                    "base_url": "https://test.example.com",
                    "api_key_env": "LLM_API_KEY",
                    "enable_thinking": True,
                },
                "response_evaluator": {
                    "model_name": "test",
                    "base_url": "https://test.example.com",
                    "api_key_env": "LLM_API_KEY",
                    "enable_thinking": True,
                },
            },
        }

        resp = client.post("/api/v1/test/evaluate", json=payload)
        assert resp.status_code == 200

        data = resp.json()
        assert data["status"] in ("completed", "partial")
        assert "evaluation_id" in data

    def test_missing_result_returns_404(self, client):
        """Requesting evaluation of an unknown ``result_id`` returns 404."""
        payload = {
            "result_id": "no-such-id",
            "evaluation_config": {
                "key_questions_evaluators": [],
                "chunk_evaluator": {
                    "model_name": "t",
                    "base_url": "https://x.com",
                    "api_key_env": "LLM_API_KEY",
                },
                "response_evaluator": {
                    "model_name": "t",
                    "base_url": "https://x.com",
                    "api_key_env": "LLM_API_KEY",
                },
            },
        }

        resp = client.post("/api/v1/test/evaluate", json=payload)
        assert resp.status_code == 404
|