feat: add Sub-Phase 9.0 config and Pydantic models for accuracy testing
This commit is contained in:
parent
7dfd603bc8
commit
852430f1f1
|
|
@ -64,3 +64,13 @@ QA_MAX_CHUNK_TOKENS=3000
|
|||
QA_STRUCTURE_MODEL=
|
||||
QA_INCLUDE_INTERNAL_REFS=true
|
||||
QA_CACHE_VISION_RESULTS=true
|
||||
|
||||
# Test results storage (Package 9)
|
||||
TEST_RESULTS_DIR=./data/test_results
|
||||
TEST_EVALUATIONS_DIR=./data/test_evaluations
|
||||
|
||||
# Evaluation batching (Package 9)
|
||||
EVAL_CHUNK_BATCH_SIZE=10
|
||||
EVAL_MAX_CONCURRENT_BATCHES=10
|
||||
EVAL_BATCH_RETRY_COUNT=2
|
||||
EVAL_BATCH_RETRY_DELAY_MS=2000
|
||||
|
|
|
|||
|
|
@ -52,6 +52,14 @@ class Settings(BaseSettings):
|
|||
qa_include_internal_refs: bool = True
|
||||
qa_cache_vision_results: bool = True
|
||||
|
||||
# Accuracy testing & evaluation (Package 9)
|
||||
test_results_dir: str = "./data/test_results"
|
||||
test_evaluations_dir: str = "./data/test_evaluations"
|
||||
eval_chunk_batch_size: int = 10
|
||||
eval_max_concurrent_batches: int = 10
|
||||
eval_batch_retry_count: int = 2
|
||||
eval_batch_retry_delay_ms: int = 2000
|
||||
|
||||
# ASR Configuration (Phase 2 + Phase 5)
|
||||
# Provider: "dashscope" (batch + realtime) or "openrouter" (batch-only)
|
||||
asr_provider: str = "dashscope"
|
||||
|
|
|
|||
|
|
@ -0,0 +1,243 @@
|
|||
from datetime import datetime, timezone
|
||||
from typing import Any, Dict, List, Literal, Optional
|
||||
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
from app.models.common import SourceMetadata
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Request models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class GenerateTextRequest(BaseModel):
    """Request model carrying a text question to run through generation.

    ``question`` must contain at least one character, ``profile`` must be
    exactly "A", "B" or "C", and ``label`` is an optional tag that defaults
    to the empty string.
    """

    question: str = Field(min_length=1)  # required; empty string is rejected
    profile: Literal["A", "B", "C"]
    label: str = ""
|
||||
|
||||
|
||||
class EvaluatorConfig(BaseModel):
    """Settings describing one evaluator model.

    All three string fields are required; ``enable_thinking`` defaults to
    False.
    """

    # ``model_name`` starts with Pydantic's protected ``model_`` prefix, which
    # triggers a UserWarning at class creation on Pydantic v2 releases that
    # protect the whole prefix. Opting out keeps the public field name intact.
    model_config = {"protected_namespaces": ()}

    model_name: str
    base_url: str
    # Presumably the NAME of an environment variable holding the API key,
    # not the key itself -- TODO confirm against the code that consumes it.
    api_key_env: str
    enable_thinking: bool = False
|
||||
|
||||
|
||||
class EvaluationConfigRequest(BaseModel):
    """Evaluator selection for an evaluation run.

    Key-question scoring accepts a list of evaluators, while chunk and
    response evaluation each take exactly one. No minimum list length is
    enforced here.
    """

    key_questions_evaluators: List[EvaluatorConfig]
    chunk_evaluator: EvaluatorConfig
    response_evaluator: EvaluatorConfig
|
||||
|
||||
|
||||
class EvaluateRequest(BaseModel):
    """Request model for evaluating a generation result.

    The result to evaluate is identified either by ``result_id`` or supplied
    inline through ``results``; at least one of the two must be present.
    """

    result_id: str = ""
    # Forward reference: GenerateResult is declared later in this module.
    results: Optional["GenerateResult"] = None
    evaluation_config: EvaluationConfigRequest

    @model_validator(mode="after")
    def _require_result_source(self) -> "EvaluateRequest":
        """Reject requests that carry neither a result id nor inline results.

        Raises:
            ValueError: if ``result_id`` is empty and ``results`` is None.
        """
        has_id = bool(self.result_id)
        has_inline = self.results is not None
        if has_id or has_inline:
            return self
        raise ValueError("Either result_id or inline results must be provided")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Result models (output of generation endpoints)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class ChunkEntry(BaseModel):
    """A single chunk: its index, text, free-form metadata and a distance value.

    ``distance`` defaults to 0.0 when not supplied.
    """

    chunk_index: int
    text: str
    metadata: Dict[str, Any]  # schema is not constrained by this model
    distance: float = 0.0
|
||||
|
||||
|
||||
class SubQuestionChunks(BaseModel):
    """The chunks associated with one sub-question (index, text, chunk list)."""

    sub_question_index: int
    sub_question_text: str
    chunks: List[ChunkEntry]
|
||||
|
||||
|
||||
class RetrievalResult(BaseModel):
    """Retrieval output: chunks grouped per sub-question, a total count and a timing.

    The ``_ms`` suffix suggests milliseconds; the model itself does not
    enforce a unit.
    """

    per_sub_question: List[SubQuestionChunks]
    total_chunks_retrieved: int
    retriever_time_ms: int
|
||||
|
||||
|
||||
class FilteredResult(BaseModel):
    """Filter output: chunks grouped per sub-question, a total count and a timing.

    Mirrors the layout of RetrievalResult with filter-specific field names.
    """

    per_sub_question: List[SubQuestionChunks]
    total_chunks_filtered: int
    filter_time_ms: int
|
||||
|
||||
|
||||
class SubQuestionSources(BaseModel):
    """Source metadata entries attached to one sub-question."""

    sub_question_index: int
    sub_question_text: str
    sources: List[SourceMetadata]  # SourceMetadata comes from app.models.common
|
||||
|
||||
|
||||
class ResponseResult(BaseModel):
    """Generated answer together with per-sub-question sources and a timing."""

    final_answer: str
    sub_question_sources: List[SubQuestionSources]
    generate_time_ms: int
|
||||
|
||||
|
||||
class InputInfo(BaseModel):
    """Description of the input that produced a result.

    Only ``text`` is required. The transcript, audio and ASR fields default
    to empty values ("" or 0.0), so a text-only input can omit them.
    """

    text: str
    reference_transcript: str = ""
    audio_filename: str = ""
    audio_duration_seconds: float = 0.0
    asr_language: str = ""
|
||||
|
||||
|
||||
class TimingInfo(BaseModel):
    """Per-stage timings recorded for a generation run.

    Every field is a required int except ``asr_time_ms``, which defaults
    to 0. Values are presumably milliseconds, per the ``_ms`` suffix.
    """

    decomposer_time_ms: int
    retriever_time_ms: int
    filter_time_ms: int
    generator_time_ms: int
    total_time_ms: int
    asr_time_ms: int = 0
|
||||
|
||||
|
||||
class GenerateResult(BaseModel):
    """Full record of one generation run.

    Bundles the input description, extracted key questions, retrieval and
    filter outputs, the generated response and stage timings.
    ``created_at`` defaults to the current UTC time as an ISO-8601 string,
    captured when the model instance is created.
    """

    result_id: str
    input_type: Literal["text", "audio"]
    profile: str  # plain str here, unlike the Literal on GenerateTextRequest
    label: str = ""
    created_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    input: InputInfo
    extracted_key_questions: List[str]
    retrieval: RetrievalResult
    filtered: FilteredResult
    response: ResponseResult
    timing: TimingInfo
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Evaluation models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class DimensionScores(BaseModel):
    """Integer scores for four dimensions, each with its own upper bound.

    The bounds are 40, 25, 20 and 15, which sum to 100. The Chinese suffixes
    translate to accuracy, completeness, clarity and conciseness; they are
    part of the field names and therefore of the serialized keys.
    """

    dimension_1_準確性: int = Field(ge=0, le=40)  # accuracy
    dimension_2_完整性: int = Field(ge=0, le=25)  # completeness
    dimension_3_清晰度: int = Field(ge=0, le=20)  # clarity
    dimension_4_簡潔性: int = Field(ge=0, le=15)  # conciseness
|
||||
|
||||
|
||||
class KeyQuestionsEvalEntry(BaseModel):
    """One evaluator's scoring of the key questions.

    ``total_score`` is bounded to 0..100 and ``max_score`` defaults to 100.
    This model does not check that ``total_score`` equals the sum of the
    dimension scores.
    """

    # ``model_name`` starts with Pydantic's protected ``model_`` prefix, which
    # triggers a UserWarning at class creation on Pydantic v2 releases that
    # protect the whole prefix. Opting out keeps the public field name intact.
    model_config = {"protected_namespaces": ()}

    model_name: str
    scores: DimensionScores
    total_score: int = Field(ge=0, le=100)
    max_score: int = 100
    comments: str
    thinking_trace: str
    time_ms: int
|
||||
|
||||
|
||||
class KeyQuestionsEvalResult(BaseModel):
    """Key-question evaluations from every evaluator plus their averages."""

    evaluations: List[KeyQuestionsEvalEntry]
    # NOTE(review): DimensionScores declares int fields, so a fractional
    # per-dimension average would fail validation -- confirm that the code
    # computing averages rounds before building this model.
    average_scores: DimensionScores
    average_total: float
|
||||
|
||||
|
||||
class AudioEvalResult(BaseModel):
    """Audio evaluation outcome.

    ``status`` is "completed" (the default) or "na". ``cer`` and ``wer``
    default to None and every count defaults to 0, so an "na" result can be
    built from the status alone. ``cer``/``wer`` are presumably character
    and word error rates -- confirm with the code that computes them.
    """

    status: Literal["completed", "na"] = "completed"
    cer: Optional[float] = None
    wer: Optional[float] = None
    reference_length: int = 0
    transcribed_length: int = 0
    substitutions: int = 0
    deletions: int = 0
    insertions: int = 0
    hits: int = 0
|
||||
|
||||
|
||||
class ChunkAccuracy(BaseModel):
    """Precision, recall and F1 together with the counts reported alongside them.

    No range constraints are applied to the float fields.
    """

    precision: float
    recall: float
    f1: float
    pipeline_chunks: int
    relevant_in_pipeline: int
|
||||
|
||||
|
||||
class GroundTruthInfo(BaseModel):
    """Ground-truth data for chunk evaluation: documents, chunks, a count and a timing."""

    relevant_documents: List[str]
    relevant_chunks: List[Dict[str, Any]]  # per-chunk dict schema is not constrained here
    total_relevant_chunks: int
    chunk_evaluation_time_ms: int
|
||||
|
||||
|
||||
class SubQuestionChunkEval(BaseModel):
    """Chunk evaluation for one sub-question.

    Holds the ground truth plus one ChunkAccuracy for the unfiltered chunk
    set and one for the filtered chunk set.
    """

    sub_question_index: int
    sub_question_text: str
    ground_truth: GroundTruthInfo
    unfiltered_accuracy: ChunkAccuracy
    filtered_accuracy: ChunkAccuracy
|
||||
|
||||
|
||||
class ChunkEvalResult(BaseModel):
    """Chunk evaluation across all sub-questions, with overall accuracy figures."""

    per_sub_question: List[SubQuestionChunkEval]
    overall_unfiltered: ChunkAccuracy
    overall_filtered: ChunkAccuracy
|
||||
|
||||
|
||||
class SubQuestionResponseEval(BaseModel):
    """Response evaluation for one sub-question.

    Pairs a ground-truth response with the matching section of the pipeline
    response, and records two float scores, comments and two timings. The
    score scale is not constrained by this model.
    """

    sub_question_index: int
    sub_question_text: str
    ground_truth_response: str
    pipeline_response_section: str
    completeness_score: float
    factual_accuracy_score: float
    comments: str
    ground_truth_generation_time_ms: int
    comparison_time_ms: int
|
||||
|
||||
|
||||
class ResponseEvalResult(BaseModel):
    """Response evaluation across all sub-questions, with two overall scores."""

    per_sub_question: List[SubQuestionResponseEval]
    overall_completeness: float
    overall_factual_accuracy: float
|
||||
|
||||
|
||||
class EvaluationTiming(BaseModel):
    """Timings for each evaluation type and for the evaluation as a whole.

    All fields are required ints, presumably milliseconds per the ``_ms``
    suffix. The total is stored as given; it is not derived from the parts.
    """

    audio_evaluation_time_ms: int
    key_questions_evaluation_time_ms: int
    chunk_evaluation_time_ms: int
    response_evaluation_time_ms: int
    total_evaluation_time_ms: int
|
||||
|
||||
|
||||
class EvaluationResult(BaseModel):
    """Top-level record of one evaluation of a generation result.

    ``status`` is "completed", "partial" or "failed". Each of the four
    evaluation sections is optional and defaults to None, so a failed or
    partial evaluation can omit them. ``created_at`` defaults to the current
    UTC time as an ISO-8601 string, captured when the instance is created.
    """

    evaluation_id: str
    result_id: str
    created_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    status: Literal["completed", "partial", "failed"]
    audio_evaluation: Optional[AudioEvalResult] = None
    key_questions_evaluation: Optional[KeyQuestionsEvalResult] = None
    chunk_evaluation: Optional[ChunkEvalResult] = None
    response_evaluation: Optional[ResponseEvalResult] = None
    timing: EvaluationTiming
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Listing response models
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class ListItemInfo(BaseModel):
    """Summary row describing one stored item in a listing response.

    All fields are required; ``input_type`` and ``profile`` are plain
    strings here rather than Literal types.
    """

    result_id: str
    input_type: str
    profile: str
    label: str
    created_at: str
    file_size_bytes: int
|
||||
|
||||
|
||||
# Type aliases for the listing responses. Results and evaluations share the
# same item shape.
LIST_RESULTS_RESPONSE = LIST_EVALUATIONS_RESPONSE = List[ListItemInfo]

# NOTE(review): the TABLE_* aliases point at a single ListItemInfo rather than
# a list of them -- confirm this is intended.
TABLE_RESULTS_RESPONSE = TABLE_EVALUATIONS_RESPONSE = ListItemInfo
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
"""Phase 9 tests: Config settings for accuracy testing & evaluation (Sub-Phase 9.0).
|
||||
|
||||
Covers:
|
||||
- test_results_dir and test_evaluations_dir default values
|
||||
- eval_chunk_batch_size = 10 (default; overridable via EVAL_CHUNK_BATCH_SIZE)
|
||||
- eval_max_concurrent_batches = 10 (rate limiting)
|
||||
- eval_batch_retry_count = 2
|
||||
- eval_batch_retry_delay_ms = 2000
|
||||
- Env var overrides work correctly
|
||||
"""
|
||||
|
||||
from app.core.config import Settings
|
||||
|
||||
|
||||
class TestEvalConfigDefaults:
    """Default values for new Package 9 evaluation settings.

    Each test removes the matching upper-case environment variable before
    building ``Settings``, so an exported TEST_RESULTS_DIR / EVAL_* value on
    the developer or CI machine cannot leak into a default-value assertion.
    Values loaded from a dotenv file, if Settings is configured to read one,
    are not neutralised here.
    """

    @staticmethod
    def _default_of(monkeypatch, field_name):
        """Return ``Settings().<field_name>`` with its env override cleared."""
        monkeypatch.delenv(field_name.upper(), raising=False)
        return getattr(Settings(), field_name)

    def test_default_results_dir(self, monkeypatch):
        value = self._default_of(monkeypatch, "test_results_dir")
        assert value == "./data/test_results"

    def test_default_evaluations_dir(self, monkeypatch):
        value = self._default_of(monkeypatch, "test_evaluations_dir")
        assert value == "./data/test_evaluations"

    def test_default_chunk_batch_size(self, monkeypatch):
        assert self._default_of(monkeypatch, "eval_chunk_batch_size") == 10

    def test_default_max_concurrent_batches(self, monkeypatch):
        assert self._default_of(monkeypatch, "eval_max_concurrent_batches") == 10

    def test_default_batch_retry_count(self, monkeypatch):
        assert self._default_of(monkeypatch, "eval_batch_retry_count") == 2

    def test_default_batch_retry_delay_ms(self, monkeypatch):
        assert self._default_of(monkeypatch, "eval_batch_retry_delay_ms") == 2000
|
||||
|
||||
|
||||
class TestEvalConfigEnvOverrides:
    """Each evaluation setting is read from its upper-case environment variable."""

    def test_results_dir_from_env(self, monkeypatch):
        override = "/tmp/test_results"
        monkeypatch.setenv("TEST_RESULTS_DIR", override)
        assert Settings().test_results_dir == override

    def test_evaluations_dir_from_env(self, monkeypatch):
        override = "/tmp/test_eval"
        monkeypatch.setenv("TEST_EVALUATIONS_DIR", override)
        assert Settings().test_evaluations_dir == override

    def test_chunk_batch_size_from_env(self, monkeypatch):
        monkeypatch.setenv("EVAL_CHUNK_BATCH_SIZE", "5")
        loaded = Settings()
        assert loaded.eval_chunk_batch_size == 5

    def test_max_concurrent_batches_from_env(self, monkeypatch):
        monkeypatch.setenv("EVAL_MAX_CONCURRENT_BATCHES", "5")
        loaded = Settings()
        assert loaded.eval_max_concurrent_batches == 5

    def test_batch_retry_count_from_env(self, monkeypatch):
        monkeypatch.setenv("EVAL_BATCH_RETRY_COUNT", "3")
        loaded = Settings()
        assert loaded.eval_batch_retry_count == 3

    def test_batch_retry_delay_ms_from_env(self, monkeypatch):
        monkeypatch.setenv("EVAL_BATCH_RETRY_DELAY_MS", "5000")
        loaded = Settings()
        assert loaded.eval_batch_retry_delay_ms == 5000
|
||||
|
|
@ -0,0 +1,351 @@
|
|||
"""Phase 9 tests: Pydantic models for accuracy testing & evaluation (Sub-Phase 9.0).
|
||||
|
||||
Covers:
|
||||
- GenerateTextRequest validation
|
||||
- EvaluateRequest validation (result_id vs inline results)
|
||||
- EvaluatorConfig model
|
||||
- Key questions scoring model (dimension_1_準確性 through dimension_4_簡潔性 with correct ranges)
|
||||
- EvaluationResult with all four evaluation types
|
||||
- Chunk evaluation models (precision/recall/F1 comparison)
|
||||
- JSON serialization round-trip
|
||||
"""
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from app.models.testing import (
|
||||
GenerateTextRequest,
|
||||
EvaluateRequest,
|
||||
EvaluatorConfig,
|
||||
EvaluationConfigRequest,
|
||||
DimensionScores,
|
||||
GenerateResult,
|
||||
InputInfo,
|
||||
TimingInfo,
|
||||
RetrievalResult,
|
||||
FilteredResult,
|
||||
ResponseResult,
|
||||
SubQuestionChunks,
|
||||
SubQuestionSources,
|
||||
ChunkEntry,
|
||||
AudioEvalResult,
|
||||
KeyQuestionsEvalEntry,
|
||||
KeyQuestionsEvalResult,
|
||||
ChunkAccuracy,
|
||||
GroundTruthInfo,
|
||||
SubQuestionChunkEval,
|
||||
ChunkEvalResult,
|
||||
SubQuestionResponseEval,
|
||||
ResponseEvalResult,
|
||||
EvaluationTiming,
|
||||
EvaluationResult,
|
||||
LIST_RESULTS_RESPONSE,
|
||||
LIST_EVALUATIONS_RESPONSE,
|
||||
TABLE_RESULTS_RESPONSE,
|
||||
TABLE_EVALUATIONS_RESPONSE,
|
||||
)
|
||||
|
||||
|
||||
class TestTextGeneration:
    """Validation behaviour of GenerateTextRequest."""

    def test_valid_request(self):
        request = GenerateTextRequest(question="test question", profile="A")
        assert request.question == "test question"
        assert request.profile == "A"
        # label falls back to its empty-string default
        assert request.label == ""

    def test_label_is_optional(self):
        request = GenerateTextRequest(question="test", profile="B", label="my label")
        assert request.label == "my label"

    def test_invalid_profile_rejected(self):
        with pytest.raises(ValidationError) as caught:
            GenerateTextRequest(question="test", profile="D")
        # At least one reported error must point at the ``profile`` field.
        locations = [str(err.get("loc", [])) for err in caught.value.errors()]
        assert any("profile" in loc for loc in locations)

    def test_empty_question_rejected(self):
        with pytest.raises(ValidationError):
            GenerateTextRequest(question="", profile="A")
|
||||
|
||||
|
||||
class TestEvaluatorConfig:
    """Construction and defaults of EvaluatorConfig."""

    def test_valid_evaluator(self):
        params = {
            "model_name": "deepseek-v4-pro",
            "base_url": "https://api.deepseek.com",
            "api_key_env": "DP_API_KEY",
            "enable_thinking": True,
        }
        evaluator = EvaluatorConfig(**params)
        assert evaluator.model_name == "deepseek-v4-pro"
        assert evaluator.enable_thinking is True

    def test_thinking_defaults_false(self):
        # enable_thinking is deliberately omitted to exercise the default.
        evaluator = EvaluatorConfig(
            model_name="test-model",
            base_url="https://example.com",
            api_key_env="TEST_KEY",
        )
        assert evaluator.enable_thinking is False
|
||||
|
||||
|
||||
class TestDimensionScores:
    """Range validation and serialization of DimensionScores."""

    # In-range baseline; individual tests override one entry at a time.
    _VALID = {
        "dimension_1_準確性": 35,
        "dimension_2_完整性": 22,
        "dimension_3_清晰度": 18,
        "dimension_4_簡潔性": 13,
    }

    def test_valid_scores(self):
        scores = DimensionScores(**self._VALID)
        assert scores.dimension_1_準確性 == 35
        assert scores.dimension_2_完整性 == 22

    def test_準確性_exceeds_max_rejected(self):
        too_high = {**self._VALID, "dimension_1_準確性": 41}
        with pytest.raises(ValidationError):
            DimensionScores(**too_high)

    def test_完整性_exceeds_max_rejected(self):
        too_high = {**self._VALID, "dimension_2_完整性": 26}
        with pytest.raises(ValidationError):
            DimensionScores(**too_high)

    def test_negative_score_rejected(self):
        negative = {**self._VALID, "dimension_1_準確性": -1}
        with pytest.raises(ValidationError):
            DimensionScores(**negative)

    def test_serialization_preserves_chinese_keys(self):
        scores = DimensionScores(**self._VALID)
        payload = json.loads(scores.model_dump_json())
        assert payload["dimension_1_準確性"] == 35
        assert payload["dimension_3_清晰度"] == 18
|
||||
|
||||
|
||||
class TestEvaluationRequest:
    """EvaluateRequest requires a result source: ``result_id`` or inline ``results``."""

    def test_with_result_id(self):
        openrouter = {
            "model_name": "qwen/qwen3.6-35b-a3b",
            "base_url": "https://openrouter.ai/api/v1",
            "api_key_env": "LLM_API_KEY",
            "enable_thinking": True,
        }
        config = EvaluationConfigRequest(
            key_questions_evaluators=[
                EvaluatorConfig(
                    model_name="deepseek-v4-pro",
                    base_url="https://api.deepseek.com",
                    api_key_env="DP_API_KEY",
                    enable_thinking=True,
                ),
                EvaluatorConfig(
                    model_name="qwen3-7b-max",
                    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
                    api_key_env="DASHSCOPE_API_KEY",
                    enable_thinking=True,
                ),
            ],
            chunk_evaluator=EvaluatorConfig(**openrouter),
            response_evaluator=EvaluatorConfig(**openrouter),
        )
        req = EvaluateRequest(result_id="abc-123", evaluation_config=config)
        assert req.result_id == "abc-123"

    def test_without_keys_fails(self):
        """Neither ``result_id`` nor ``results`` supplied -> validation error."""
        placeholder = {
            "model_name": "test",
            "base_url": "https://example.com",
            "api_key_env": "TEST_KEY",
        }
        # Build a fully valid config OUTSIDE the ``raises`` block so that the
        # only possible failure is the missing result source, not a config
        # validation error.
        config = EvaluationConfigRequest(
            key_questions_evaluators=[EvaluatorConfig(**placeholder)],
            chunk_evaluator=EvaluatorConfig(**placeholder),
            response_evaluator=EvaluatorConfig(**placeholder),
        )
        with pytest.raises(ValidationError) as exc_info:
            EvaluateRequest(evaluation_config=config)
        # Pin the failure to the model validator's own message.
        assert "Either result_id or inline results must be provided" in str(
            exc_info.value
        )
|
||||
|
||||
|
||||
class TestAudioEvalResult:
    """Construction and defaults of AudioEvalResult."""

    def test_complete_cer_wer(self):
        metrics = {
            "cer": 0.052,
            "wer": 0.083,
            "reference_length": 42,
            "transcribed_length": 40,
            "substitutions": 1,
            "deletions": 2,
            "insertions": 0,
            "hits": 39,
        }
        audio = AudioEvalResult(**metrics)
        assert audio.cer == 0.052
        # status was not passed, so the "completed" default applies
        assert audio.status == "completed"

    def test_na_when_no_reference(self):
        audio = AudioEvalResult(status="na")
        assert audio.status == "na"
        assert audio.cer is None
        assert audio.wer is None
|
||||
|
||||
|
||||
class TestChunkAccuracy:
    """Construction of ChunkAccuracy with boundary metric values."""

    def test_perfect_precision_recall(self):
        ca = ChunkAccuracy(
            precision=1.0,
            recall=1.0,
            f1=1.0,
            pipeline_chunks=3,
            relevant_in_pipeline=3,
        )
        assert ca.f1 == 1.0

    def test_zero_precision(self):
        # With no relevant chunk among the pipeline chunks
        # (relevant_in_pipeline=0) recall cannot be positive, so the fixture
        # uses 0.0 to stay self-consistent. This assumes relevant_in_pipeline
        # counts the relevant chunks found in the pipeline output.
        ca = ChunkAccuracy(
            precision=0.0,
            recall=0.0,
            f1=0.0,
            pipeline_chunks=5,
            relevant_in_pipeline=0,
        )
        assert ca.precision == 0.0
        assert ca.f1 == 0.0
|
||||
|
||||
|
||||
class TestEvalResult:
    """Construction and JSON round-trip of EvaluationResult."""

    def test_completed_with_all_dimensions(self):
        audio = AudioEvalResult(
            cer=0.05,
            wer=0.08,
            reference_length=42,
            transcribed_length=40,
            substitutions=1,
            deletions=2,
            insertions=0,
            hits=39,
        )

        score_values = {
            "dimension_1_準確性": 35,
            "dimension_2_完整性": 22,
            "dimension_3_清晰度": 18,
            "dimension_4_簡潔性": 13,
        }
        entry = KeyQuestionsEvalEntry(
            model_name="deepseek-v4-pro",
            scores=DimensionScores(**score_values),
            total_score=88,
            max_score=100,
            comments="good",
            thinking_trace="...",
            time_ms=3000,
        )
        key_questions = KeyQuestionsEvalResult(
            evaluations=[entry],
            average_scores=DimensionScores(**score_values),
            average_total=88.0,
        )

        chunks = ChunkEvalResult(
            per_sub_question=[],
            overall_unfiltered=ChunkAccuracy(
                precision=0.6,
                recall=1.0,
                f1=0.75,
                pipeline_chunks=5,
                relevant_in_pipeline=3,
            ),
            overall_filtered=ChunkAccuracy(
                precision=1.0,
                recall=1.0,
                f1=1.0,
                pipeline_chunks=3,
                relevant_in_pipeline=3,
            ),
        )

        response = ResponseEvalResult(
            per_sub_question=[],
            overall_completeness=0.85,
            overall_factual_accuracy=0.92,
        )

        timing = EvaluationTiming(
            audio_evaluation_time_ms=23,
            key_questions_evaluation_time_ms=6000,
            chunk_evaluation_time_ms=14000,
            response_evaluation_time_ms=7000,
            total_evaluation_time_ms=27000,
        )

        outcome = EvaluationResult(
            evaluation_id="eval-001",
            result_id="result-001",
            status="completed",
            audio_evaluation=audio,
            key_questions_evaluation=key_questions,
            chunk_evaluation=chunks,
            response_evaluation=response,
            timing=timing,
        )
        assert outcome.status == "completed"
        assert outcome.audio_evaluation is not None
        assert outcome.key_questions_evaluation is not None

    def test_failed_status(self):
        # All four evaluation sections are omitted and fall back to None.
        timing = EvaluationTiming(
            audio_evaluation_time_ms=10,
            key_questions_evaluation_time_ms=0,
            chunk_evaluation_time_ms=0,
            response_evaluation_time_ms=0,
            total_evaluation_time_ms=10,
        )
        outcome = EvaluationResult(
            evaluation_id="eval-002",
            result_id="result-002",
            status="failed",
            timing=timing,
        )
        assert outcome.status == "failed"

    def test_serialization_roundtrip(self):
        timing = EvaluationTiming(
            audio_evaluation_time_ms=10,
            key_questions_evaluation_time_ms=100,
            chunk_evaluation_time_ms=200,
            response_evaluation_time_ms=300,
            total_evaluation_time_ms=610,
        )
        original = EvaluationResult(
            evaluation_id="eval-003",
            result_id="result-003",
            status="completed",
            timing=timing,
        )

        # Serialize to JSON text, then parse it back into plain Python data.
        payload = json.loads(original.model_dump_json())
        assert payload["evaluation_id"] == "eval-003"
        assert payload["status"] == "completed"

        # Round-trip deserialization
        restored = EvaluationResult.model_validate(payload)
        assert restored.evaluation_id == "eval-003"
|
||||
Loading…
Reference in New Issue