352 lines
11 KiB
Python
352 lines
11 KiB
Python
"""Phase 9 tests: Pydantic models for accuracy testing & evaluation (Sub-Phase 9.0).
|
|
|
|
Covers:
|
|
- GenerateTextRequest validation
|
|
- EvaluateRequest validation (result_id vs inline results)
|
|
- EvaluatorConfig model
|
|
- Key questions scoring model (dimension_score_1 through dimension_score_4 with correct ranges)
|
|
- EvaluationResult with all four evaluation types
|
|
- Chunk evaluation models (precision/recall/F1 comparison)
|
|
- JSON serialization round-trip
|
|
"""
|
|
import json
|
|
from typing import Optional
|
|
|
|
import pytest
|
|
from pydantic import ValidationError
|
|
|
|
from app.models.testing import (
|
|
GenerateTextRequest,
|
|
EvaluateRequest,
|
|
EvaluatorConfig,
|
|
EvaluationConfigRequest,
|
|
DimensionScores,
|
|
GenerateResult,
|
|
InputInfo,
|
|
TimingInfo,
|
|
RetrievalResult,
|
|
FilteredResult,
|
|
ResponseResult,
|
|
SubQuestionChunks,
|
|
SubQuestionSources,
|
|
ChunkEntry,
|
|
AudioEvalResult,
|
|
KeyQuestionsEvalEntry,
|
|
KeyQuestionsEvalResult,
|
|
ChunkAccuracy,
|
|
GroundTruthInfo,
|
|
SubQuestionChunkEval,
|
|
ChunkEvalResult,
|
|
SubQuestionResponseEval,
|
|
ResponseEvalResult,
|
|
EvaluationTiming,
|
|
EvaluationResult,
|
|
LIST_RESULTS_RESPONSE,
|
|
LIST_EVALUATIONS_RESPONSE,
|
|
TABLE_RESULTS_RESPONSE,
|
|
TABLE_EVALUATIONS_RESPONSE,
|
|
)
|
|
|
|
|
|
class TestTextGeneration:
    """Validation behaviour of ``GenerateTextRequest``."""

    def test_valid_request(self):
        """A question plus profile "A" validates and the label falls back to ""."""
        payload = {"question": "test question", "profile": "A"}
        request = GenerateTextRequest(**payload)
        assert request.question == "test question"
        assert request.profile == "A"
        assert request.label == ""

    def test_label_is_optional(self):
        """An explicitly supplied label is stored unchanged."""
        payload = {"question": "test", "profile": "B", "label": "my label"}
        request = GenerateTextRequest(**payload)
        assert request.label == "my label"

    def test_invalid_profile_rejected(self):
        """Profile "D" is refused and the reported error points at ``profile``."""
        with pytest.raises(ValidationError) as caught:
            GenerateTextRequest(question="test", profile="D")
        # Match on the stringified location so nested locations also count.
        locations = [str(err.get("loc", [])) for err in caught.value.errors()]
        assert any("profile" in location for location in locations)

    def test_empty_question_rejected(self):
        """An empty question string fails validation."""
        with pytest.raises(ValidationError):
            GenerateTextRequest(question="", profile="A")
|
|
|
|
|
|
class TestEvaluatorConfig:
    """Construction and defaults of ``EvaluatorConfig``."""

    def test_valid_evaluator(self):
        """All four fields supplied: the name and the thinking flag are kept."""
        fields = {
            "model_name": "deepseek-v4-pro",
            "base_url": "https://api.deepseek.com",
            "api_key_env": "DP_API_KEY",
            "enable_thinking": True,
        }
        evaluator = EvaluatorConfig(**fields)
        assert evaluator.model_name == "deepseek-v4-pro"
        assert evaluator.enable_thinking is True

    def test_thinking_defaults_false(self):
        """Leaving out ``enable_thinking`` yields ``False``."""
        fields = {
            "model_name": "test-model",
            "base_url": "https://example.com",
            "api_key_env": "TEST_KEY",
        }
        evaluator = EvaluatorConfig(**fields)
        assert evaluator.enable_thinking is False
|
|
|
|
|
|
class TestDimensionScores:
    """Range checks and JSON output of the four-dimension ``DimensionScores`` model."""

    # Score set accepted by the model (exercised by test_valid_scores); the
    # rejection tests replace exactly one entry with an out-of-range value.
    _BASELINE = {
        "dimension_1_準確性": 35,
        "dimension_2_完整性": 22,
        "dimension_3_清晰度": 18,
        "dimension_4_簡潔性": 13,
    }

    @classmethod
    def _build(cls, **overrides):
        """Instantiate ``DimensionScores`` from the baseline with *overrides* applied."""
        return DimensionScores(**{**cls._BASELINE, **overrides})

    def test_valid_scores(self):
        """The baseline scores validate and are readable as attributes."""
        scores = self._build()
        assert scores.dimension_1_準確性 == 35
        assert scores.dimension_2_完整性 == 22

    def test_準確性_exceeds_max_rejected(self):
        """A first-dimension score of 41 is refused (presumably the cap is 40 — confirm in the model)."""
        with pytest.raises(ValidationError):
            self._build(dimension_1_準確性=41)

    def test_完整性_exceeds_max_rejected(self):
        """A second-dimension score of 26 is refused (presumably the cap is 25 — confirm in the model)."""
        with pytest.raises(ValidationError):
            self._build(dimension_2_完整性=26)

    def test_negative_score_rejected(self):
        """A score of -1 is refused."""
        with pytest.raises(ValidationError):
            self._build(dimension_1_準確性=-1)

    def test_serialization_preserves_chinese_keys(self):
        """The JSON dump keeps the non-ASCII field names as its keys."""
        decoded = json.loads(self._build().model_dump_json())
        assert decoded["dimension_1_準確性"] == 35
        assert decoded["dimension_3_清晰度"] == 18
|
|
|
|
|
|
class TestEvaluationRequest:
    """Validation of ``EvaluateRequest`` together with its nested evaluator configuration."""

    def test_with_result_id(self):
        """A request that names a result id and carries a full evaluator config validates."""
        deepseek = EvaluatorConfig(
            model_name="deepseek-v4-pro",
            base_url="https://api.deepseek.com",
            api_key_env="DP_API_KEY",
            enable_thinking=True,
        )
        qwen_max = EvaluatorConfig(
            model_name="qwen3-7b-max",
            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
            api_key_env="DASHSCOPE_API_KEY",
            enable_thinking=True,
        )
        # The chunk and response evaluators use the same settings; each is
        # still built as its own instance.
        openrouter_fields = {
            "model_name": "qwen/qwen3.6-35b-a3b",
            "base_url": "https://openrouter.ai/api/v1",
            "api_key_env": "LLM_API_KEY",
            "enable_thinking": True,
        }
        config = EvaluationConfigRequest(
            key_questions_evaluators=[deepseek, qwen_max],
            chunk_evaluator=EvaluatorConfig(**openrouter_fields),
            response_evaluator=EvaluatorConfig(**openrouter_fields),
        )
        request = EvaluateRequest(result_id="abc-123", evaluation_config=config)
        assert request.result_id == "abc-123"

    def test_without_keys_fails(self):
        """A request without a result id and with no key-question evaluators is refused.

        NOTE(review): the request both omits ``result_id`` and passes an empty
        evaluator list, so this test alone does not show which of the two
        triggers the error — confirm against the model.
        """
        placeholder_fields = {
            "model_name": "test",
            "base_url": "https://example.com",
            "api_key_env": "TEST_KEY",
        }
        # Every model is constructed inside the raises-block so that a
        # ValidationError raised at any level satisfies the expectation.
        with pytest.raises(ValidationError):
            EvaluateRequest(
                evaluation_config=EvaluationConfigRequest(
                    key_questions_evaluators=[],
                    chunk_evaluator=EvaluatorConfig(**placeholder_fields),
                    response_evaluator=EvaluatorConfig(**placeholder_fields),
                ),
            )
|
|
|
|
|
|
class TestAudioEvalResult:
    """``AudioEvalResult`` with full error-rate data and in its "na" form."""

    def test_complete_cer_wer(self):
        """With every metric supplied, CER is kept and status reads "completed"."""
        metrics = {
            "cer": 0.052,
            "wer": 0.083,
            "reference_length": 42,
            "transcribed_length": 40,
            "substitutions": 1,
            "deletions": 2,
            "insertions": 0,
            "hits": 39,
        }
        audio = AudioEvalResult(**metrics)
        assert audio.cer == 0.052
        assert audio.status == "completed"

    def test_na_when_no_reference(self):
        """A result built with only status "na" leaves both error rates as None."""
        audio = AudioEvalResult(status="na")
        assert audio.status == "na"
        for rate in (audio.cer, audio.wer):
            assert rate is None
|
|
|
|
|
|
class TestChunkAccuracy:
    """``ChunkAccuracy`` accepts precision/recall/F1 values at both extremes."""

    def test_perfect_precision_recall(self):
        """All-ones metrics validate and F1 reads back as 1.0."""
        fields = {
            "precision": 1.0,
            "recall": 1.0,
            "f1": 1.0,
            "pipeline_chunks": 3,
            "relevant_in_pipeline": 3,
        }
        accuracy = ChunkAccuracy(**fields)
        assert accuracy.f1 == 1.0

    def test_zero_precision(self):
        """Zero precision and zero F1 are accepted and read back unchanged."""
        fields = {
            "precision": 0.0,
            "recall": 0.8,
            "f1": 0.0,
            "pipeline_chunks": 5,
            "relevant_in_pipeline": 0,
        }
        accuracy = ChunkAccuracy(**fields)
        assert accuracy.precision == 0.0
        assert accuracy.f1 == 0.0
|
|
|
|
|
|
class TestEvalResult:
    """``EvaluationResult`` in completed and failed states, plus a JSON round-trip."""

    def test_completed_with_all_dimensions(self):
        """A completed result carrying all four evaluation sections keeps every one of them."""
        scores = DimensionScores(
            dimension_1_準確性=35,
            dimension_2_完整性=22,
            dimension_3_清晰度=18,
            dimension_4_簡潔性=13,
        )
        average_scores = DimensionScores(
            dimension_1_準確性=35,
            dimension_2_完整性=22,
            dimension_3_清晰度=18,
            dimension_4_簡潔性=13,
        )
        result = EvaluationResult(
            evaluation_id="eval-001",
            result_id="result-001",
            status="completed",
            audio_evaluation=AudioEvalResult(
                cer=0.05,
                wer=0.08,
                reference_length=42,
                transcribed_length=40,
                substitutions=1,
                deletions=2,
                insertions=0,
                hits=39,
            ),
            key_questions_evaluation=KeyQuestionsEvalResult(
                evaluations=[
                    KeyQuestionsEvalEntry(
                        model_name="deepseek-v4-pro",
                        scores=scores,
                        total_score=88,
                        max_score=100,
                        comments="good",
                        thinking_trace="...",
                        time_ms=3000,
                    ),
                ],
                average_scores=average_scores,
                average_total=88.0,
            ),
            chunk_evaluation=ChunkEvalResult(
                per_sub_question=[],
                overall_unfiltered=ChunkAccuracy(
                    precision=0.6, recall=1.0, f1=0.75,
                    pipeline_chunks=5, relevant_in_pipeline=3,
                ),
                overall_filtered=ChunkAccuracy(
                    precision=1.0, recall=1.0, f1=1.0,
                    pipeline_chunks=3, relevant_in_pipeline=3,
                ),
            ),
            response_evaluation=ResponseEvalResult(
                per_sub_question=[],
                overall_completeness=0.85,
                overall_factual_accuracy=0.92,
            ),
            timing=EvaluationTiming(
                audio_evaluation_time_ms=23,
                key_questions_evaluation_time_ms=6000,
                chunk_evaluation_time_ms=14000,
                response_evaluation_time_ms=7000,
                total_evaluation_time_ms=27000,
            ),
        )
        assert result.status == "completed"
        # Check every section that was passed in, so a silently dropped
        # section cannot go unnoticed.
        assert result.audio_evaluation is not None
        assert result.key_questions_evaluation is not None
        assert result.chunk_evaluation is not None
        assert result.response_evaluation is not None

    def test_failed_status(self):
        """A result with status "failed" validates with only ids and timing supplied."""
        result = EvaluationResult(
            evaluation_id="eval-002",
            result_id="result-002",
            status="failed",
            timing=EvaluationTiming(
                audio_evaluation_time_ms=10,
                key_questions_evaluation_time_ms=0,
                chunk_evaluation_time_ms=0,
                response_evaluation_time_ms=0,
                total_evaluation_time_ms=10,
            ),
        )
        assert result.status == "failed"

    def test_serialization_roundtrip(self):
        """Dumping to JSON and validating the parsed data reproduces the same model content."""
        result = EvaluationResult(
            evaluation_id="eval-003",
            result_id="result-003",
            status="completed",
            timing=EvaluationTiming(
                audio_evaluation_time_ms=10,
                key_questions_evaluation_time_ms=100,
                chunk_evaluation_time_ms=200,
                response_evaluation_time_ms=300,
                total_evaluation_time_ms=610,
            ),
        )
        data = json.loads(result.model_dump_json())
        assert data["evaluation_id"] == "eval-003"
        assert data["status"] == "completed"

        # Round-trip deserialization
        reloaded = EvaluationResult.model_validate(data)
        assert reloaded.evaluation_id == "eval-003"
        # Compare full dumps rather than a single field, so a field lost or
        # altered on the way through JSON fails the test.
        assert reloaded.model_dump() == result.model_dump()
|