"""Phase 9 tests: Pydantic models for accuracy testing & evaluation (Sub-Phase 9.0). Covers: - GenerateTextRequest validation - EvaluateRequest validation (result_id vs inline results) - EvaluatorConfig model - Key questions scoring model (dimension_score_1 through dimension_score_4 with correct ranges) - EvaluationResult with all four evaluation types - Chunk evaluation models (precision/recall/F1 comparison) - JSON serialization round-trip """ import json from typing import Optional import pytest from pydantic import ValidationError from app.models.testing import ( GenerateTextRequest, EvaluateRequest, EvaluatorConfig, EvaluationConfigRequest, DimensionScores, GenerateResult, InputInfo, TimingInfo, RetrievalResult, FilteredResult, ResponseResult, SubQuestionChunks, SubQuestionSources, ChunkEntry, AudioEvalResult, KeyQuestionsEvalEntry, KeyQuestionsEvalResult, ChunkAccuracy, GroundTruthInfo, SubQuestionChunkEval, ChunkEvalResult, SubQuestionResponseEval, ResponseEvalResult, EvaluationTiming, EvaluationResult, LIST_RESULTS_RESPONSE, LIST_EVALUATIONS_RESPONSE, TABLE_RESULTS_RESPONSE, TABLE_EVALUATIONS_RESPONSE, ) class TestTextGeneration: def test_valid_request(self): req = GenerateTextRequest( question="test question", profile="A", ) assert req.question == "test question" assert req.profile == "A" assert req.label == "" def test_label_is_optional(self): req = GenerateTextRequest( question="test", profile="B", label="my label", ) assert req.label == "my label" def test_invalid_profile_rejected(self): with pytest.raises(ValidationError) as exc_info: GenerateTextRequest(question="test", profile="D") errors = exc_info.value.errors() assert any("profile" in str(e.get("loc", [])) for e in errors) def test_empty_question_rejected(self): with pytest.raises(ValidationError): GenerateTextRequest(question="", profile="A") class TestEvaluatorConfig: def test_valid_evaluator(self): cfg = EvaluatorConfig( model_name="deepseek-v4-pro", base_url="https://api.deepseek.com", api_key_env="DP_API_KEY", enable_thinking=True, ) assert cfg.model_name == "deepseek-v4-pro" assert cfg.enable_thinking is True def test_thinking_defaults_false(self): cfg = EvaluatorConfig( model_name="test-model", base_url="https://example.com", api_key_env="TEST_KEY", ) assert cfg.enable_thinking is False class TestDimensionScores: def test_valid_scores(self): ds = DimensionScores( dimension_1_準確性=35, dimension_2_完整性=22, dimension_3_清晰度=18, dimension_4_簡潔性=13, ) assert ds.dimension_1_準確性 == 35 assert ds.dimension_2_完整性 == 22 def test_準確性_exceeds_max_rejected(self): with pytest.raises(ValidationError): DimensionScores( dimension_1_準確性=41, dimension_2_完整性=22, dimension_3_清晰度=18, dimension_4_簡潔性=13, ) def test_完整性_exceeds_max_rejected(self): with pytest.raises(ValidationError): DimensionScores( dimension_1_準確性=35, dimension_2_完整性=26, dimension_3_清晰度=18, dimension_4_簡潔性=13, ) def test_negative_score_rejected(self): with pytest.raises(ValidationError): DimensionScores( dimension_1_準確性=-1, dimension_2_完整性=22, dimension_3_清晰度=18, dimension_4_簡潔性=13, ) def test_serialization_preserves_chinese_keys(self): ds = DimensionScores( dimension_1_準確性=35, dimension_2_完整性=22, dimension_3_清晰度=18, dimension_4_簡潔性=13, ) data = json.loads(ds.model_dump_json()) assert data["dimension_1_準確性"] == 35 assert data["dimension_3_清晰度"] == 18 class TestEvaluationRequest: def test_with_result_id(self): req = EvaluateRequest( result_id="abc-123", evaluation_config=EvaluationConfigRequest( key_questions_evaluators=[ EvaluatorConfig( model_name="deepseek-v4-pro", base_url="https://api.deepseek.com", api_key_env="DP_API_KEY", enable_thinking=True, ), EvaluatorConfig( model_name="qwen3-7b-max", base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", api_key_env="DASHSCOPE_API_KEY", enable_thinking=True, ), ], chunk_evaluator=EvaluatorConfig( model_name="qwen/qwen3.6-35b-a3b", base_url="https://openrouter.ai/api/v1", api_key_env="LLM_API_KEY", enable_thinking=True, ), response_evaluator=EvaluatorConfig( model_name="qwen/qwen3.6-35b-a3b", base_url="https://openrouter.ai/api/v1", api_key_env="LLM_API_KEY", enable_thinking=True, ), ), ) assert req.result_id == "abc-123" def test_without_keys_fails(self): with pytest.raises(ValidationError): EvaluateRequest( evaluation_config=EvaluationConfigRequest( key_questions_evaluators=[], chunk_evaluator=EvaluatorConfig( model_name="test", base_url="https://example.com", api_key_env="TEST_KEY", ), response_evaluator=EvaluatorConfig( model_name="test", base_url="https://example.com", api_key_env="TEST_KEY", ), ), ) class TestAudioEvalResult: def test_complete_cer_wer(self): aer = AudioEvalResult( cer=0.052, wer=0.083, reference_length=42, transcribed_length=40, substitutions=1, deletions=2, insertions=0, hits=39, ) assert aer.cer == 0.052 assert aer.status == "completed" def test_na_when_no_reference(self): aer = AudioEvalResult(status="na") assert aer.status == "na" assert aer.cer is None assert aer.wer is None class TestChunkAccuracy: def test_perfect_precision_recall(self): ca = ChunkAccuracy( precision=1.0, recall=1.0, f1=1.0, pipeline_chunks=3, relevant_in_pipeline=3, ) assert ca.f1 == 1.0 def test_zero_precision(self): ca = ChunkAccuracy( precision=0.0, recall=0.8, f1=0.0, pipeline_chunks=5, relevant_in_pipeline=0, ) assert ca.precision == 0.0 assert ca.f1 == 0.0 class TestEvalResult: def test_completed_with_all_dimensions(self): result = EvaluationResult( evaluation_id="eval-001", result_id="result-001", status="completed", audio_evaluation=AudioEvalResult( cer=0.05, wer=0.08, reference_length=42, transcribed_length=40, substitutions=1, deletions=2, insertions=0, hits=39, ), key_questions_evaluation=KeyQuestionsEvalResult( evaluations=[ KeyQuestionsEvalEntry( model_name="deepseek-v4-pro", scores=DimensionScores( dimension_1_準確性=35, dimension_2_完整性=22, dimension_3_清晰度=18, dimension_4_簡潔性=13, ), total_score=88, max_score=100, comments="good", thinking_trace="...", time_ms=3000, ), ], average_scores=DimensionScores( dimension_1_準確性=35, dimension_2_完整性=22, dimension_3_清晰度=18, dimension_4_簡潔性=13, ), average_total=88.0, ), chunk_evaluation=ChunkEvalResult( per_sub_question=[], overall_unfiltered=ChunkAccuracy( precision=0.6, recall=1.0, f1=0.75, pipeline_chunks=5, relevant_in_pipeline=3, ), overall_filtered=ChunkAccuracy( precision=1.0, recall=1.0, f1=1.0, pipeline_chunks=3, relevant_in_pipeline=3, ), ), response_evaluation=ResponseEvalResult( per_sub_question=[], overall_completeness=0.85, overall_factual_accuracy=0.92, ), timing=EvaluationTiming( audio_evaluation_time_ms=23, key_questions_evaluation_time_ms=6000, chunk_evaluation_time_ms=14000, response_evaluation_time_ms=7000, total_evaluation_time_ms=27000, ), ) assert result.status == "completed" assert result.audio_evaluation is not None assert result.key_questions_evaluation is not None def test_failed_status(self): result = EvaluationResult( evaluation_id="eval-002", result_id="result-002", status="failed", timing=EvaluationTiming( audio_evaluation_time_ms=10, key_questions_evaluation_time_ms=0, chunk_evaluation_time_ms=0, response_evaluation_time_ms=0, total_evaluation_time_ms=10, ), ) assert result.status == "failed" def test_serialization_roundtrip(self): result = EvaluationResult( evaluation_id="eval-003", result_id="result-003", status="completed", timing=EvaluationTiming( audio_evaluation_time_ms=10, key_questions_evaluation_time_ms=100, chunk_evaluation_time_ms=200, response_evaluation_time_ms=300, total_evaluation_time_ms=610, ), ) data = json.loads(result.model_dump_json()) assert data["evaluation_id"] == "eval-003" assert data["status"] == "completed" # Round-trip deserialization reloaded = EvaluationResult.model_validate(data) assert reloaded.evaluation_id == "eval-003"