244 lines
6.3 KiB
Python
244 lines
6.3 KiB
Python
from datetime import datetime, timezone
|
|
from typing import Any, Dict, List, Literal, Optional
|
|
|
|
from pydantic import BaseModel, Field, model_validator
|
|
|
|
from app.models.common import SourceMetadata
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Request models
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class GenerateTextRequest(BaseModel):
    """Request body for generating a result from a text question."""

    # Required; an empty string is rejected by the min_length constraint.
    question: str = Field(min_length=1)
    # Must be exactly one of the three listed profile identifiers.
    profile: Literal["A", "B", "C"]
    # Optional free-form label; defaults to the empty string.
    label: str = Field(default="")
|
|
|
|
|
|
class EvaluatorConfig(BaseModel):
    """Connection settings for a single evaluator model."""

    # Identifier of the evaluator model.
    model_name: str
    # Endpoint base URL (not validated as a URL here; stored as a plain string).
    base_url: str
    # Presumably the NAME of an environment variable holding the API key,
    # not the key itself -- TODO confirm against the consumer of this config.
    api_key_env: str
    # Opt-in flag; off unless the caller sets it.
    enable_thinking: bool = False
|
|
|
|
|
|
class EvaluationConfigRequest(BaseModel):
    """Evaluator configuration for each evaluation stage.

    Key-question evaluation accepts a list of evaluators, while chunk and
    response evaluation each take exactly one.
    """

    key_questions_evaluators: List[EvaluatorConfig]
    chunk_evaluator: EvaluatorConfig
    response_evaluator: EvaluatorConfig
|
|
|
|
|
|
class EvaluateRequest(BaseModel):
    """Request body for evaluating a generation result.

    The result to evaluate is identified either by ``result_id`` or by an
    inline ``results`` payload; at least one of the two must be supplied.
    """

    # Empty string means "no id supplied".
    result_id: str = ""
    # Forward reference: GenerateResult is declared further down the module.
    results: Optional["GenerateResult"] = None
    evaluation_config: EvaluationConfigRequest

    @model_validator(mode="after")
    def _require_result_source(self) -> "EvaluateRequest":
        """Reject requests that carry neither a result id nor inline results.

        Raises:
            ValueError: if ``result_id`` is empty and ``results`` is ``None``.
        """
        has_stored_reference = bool(self.result_id)
        has_inline_payload = self.results is not None
        if has_stored_reference or has_inline_payload:
            return self
        raise ValueError("Either result_id or inline results must be provided")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Result models (output of generation endpoints)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class ChunkEntry(BaseModel):
    """A single chunk with its text, metadata and distance value."""

    chunk_index: int
    text: str
    # Free-form mapping; no schema is enforced for the values.
    metadata: Dict[str, Any]
    # Defaults to 0.0 when the producer supplies no distance.
    distance: float = 0.0
|
|
|
|
|
|
class SubQuestionChunks(BaseModel):
    """Chunks grouped under one sub-question."""

    sub_question_index: int
    sub_question_text: str
    chunks: List[ChunkEntry]
|
|
|
|
|
|
class RetrievalResult(BaseModel):
    """Retrieval-stage output: chunks per sub-question plus totals."""

    per_sub_question: List[SubQuestionChunks]
    # Stored as supplied; not cross-checked against per_sub_question here.
    total_chunks_retrieved: int
    # Presumably milliseconds, per the `_ms` suffix.
    retriever_time_ms: int
|
|
|
|
|
|
class FilteredResult(BaseModel):
    """Filter-stage output: chunks per sub-question plus totals."""

    per_sub_question: List[SubQuestionChunks]
    # Stored as supplied; not cross-checked against per_sub_question here.
    total_chunks_filtered: int
    # Presumably milliseconds, per the `_ms` suffix.
    filter_time_ms: int
|
|
|
|
|
|
class SubQuestionSources(BaseModel):
    """Source metadata entries grouped under one sub-question."""

    sub_question_index: int
    sub_question_text: str
    # SourceMetadata is imported from app.models.common.
    sources: List[SourceMetadata]
|
|
|
|
|
|
class ResponseResult(BaseModel):
    """Generated answer together with its per-sub-question sources."""

    final_answer: str
    sub_question_sources: List[SubQuestionSources]
    # Presumably milliseconds, per the `_ms` suffix.
    generate_time_ms: int
|
|
|
|
|
|
class InputInfo(BaseModel):
    """Description of the input a result was generated from.

    Only ``text`` is required. The remaining fields default to empty/zero
    values; they look audio-specific -- TODO confirm they stay at their
    defaults for text inputs.
    """

    text: str
    reference_transcript: str = ""
    audio_filename: str = ""
    audio_duration_seconds: float = 0.0
    asr_language: str = ""
|
|
|
|
|
|
class TimingInfo(BaseModel):
    """Per-stage timing figures for one generation run.

    All values are integers, presumably milliseconds per the `_ms` suffix.
    """

    decomposer_time_ms: int
    retriever_time_ms: int
    filter_time_ms: int
    generator_time_ms: int
    # Stored as supplied; not derived from the stage values in this model.
    total_time_ms: int
    # The only optional figure; zero when not provided.
    asr_time_ms: int = 0
|
|
|
|
|
|
class GenerateResult(BaseModel):
    """Complete output of a generation run, with all intermediate stages."""

    result_id: str
    input_type: Literal["text", "audio"]
    # Plain string here (no Literal restriction is applied on the result side).
    profile: str
    label: str = Field(default="")
    # ISO-8601 UTC timestamp, computed at instantiation when not supplied.
    created_at: str = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc).isoformat()
    )
    input: InputInfo
    extracted_key_questions: List[str]
    retrieval: RetrievalResult
    filtered: FilteredResult
    response: ResponseResult
    timing: TimingInfo
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Evaluation models
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class DimensionScores(BaseModel):
    """Integer scores for four rubric dimensions.

    Each score is bounded from 0 up to its own maximum; the four maxima
    (40 + 25 + 20 + 15) add up to 100.
    """

    # Accuracy, 0-40.
    dimension_1_準確性: int = Field(ge=0, le=40)
    # Completeness, 0-25.
    dimension_2_完整性: int = Field(ge=0, le=25)
    # Clarity, 0-20.
    dimension_3_清晰度: int = Field(ge=0, le=20)
    # Conciseness, 0-15.
    dimension_4_簡潔性: int = Field(ge=0, le=15)
|
|
|
|
|
|
class KeyQuestionsEvalEntry(BaseModel):
    """Key-question evaluation produced by one evaluator model."""

    model_name: str
    scores: DimensionScores
    # Bounded to 0..100; not checked against the sum of `scores` here.
    total_score: int = Field(ge=0, le=100)
    max_score: int = Field(default=100)
    comments: str
    # Required, even if the evaluator produced no trace.
    thinking_trace: str
    # Presumably milliseconds, per the `_ms` suffix.
    time_ms: int
|
|
|
|
|
|
class KeyQuestionsEvalResult(BaseModel):
    """Aggregate of the key-question evaluations from all evaluators."""

    evaluations: List[KeyQuestionsEvalEntry]
    # NOTE(review): DimensionScores declares int fields, so fractional
    # averages would need rounding by the producer -- confirm with caller.
    average_scores: DimensionScores
    average_total: float
|
|
|
|
|
|
class AudioEvalResult(BaseModel):
    """Audio transcription evaluation figures.

    ``status`` is either "completed" (the default) or "na". The two rate
    fields default to ``None`` and every count defaults to zero.
    """

    status: Literal["completed", "na"] = "completed"
    # Error rates; presumably character / word error rate -- TODO confirm.
    cer: Optional[float] = None
    wer: Optional[float] = None
    reference_length: int = 0
    transcribed_length: int = 0
    # Edit-operation counts.
    substitutions: int = 0
    deletions: int = 0
    insertions: int = 0
    hits: int = 0
|
|
|
|
|
|
class ChunkAccuracy(BaseModel):
    """Precision / recall / F1 figures with the counts reported alongside."""

    # No range constraint is enforced on the three ratios.
    precision: float
    recall: float
    f1: float
    pipeline_chunks: int
    relevant_in_pipeline: int
|
|
|
|
|
|
class GroundTruthInfo(BaseModel):
    """Ground-truth relevance data attached to a sub-question evaluation."""

    relevant_documents: List[str]
    # Free-form dicts; no schema is enforced for chunk entries.
    relevant_chunks: List[Dict[str, Any]]
    # Stored as supplied; not cross-checked against relevant_chunks here.
    total_relevant_chunks: int
    # Presumably milliseconds, per the `_ms` suffix.
    chunk_evaluation_time_ms: int
|
|
|
|
|
|
class SubQuestionChunkEval(BaseModel):
    """Chunk evaluation for one sub-question.

    Carries the ground truth plus two accuracy records, one labelled
    unfiltered and one labelled filtered.
    """

    sub_question_index: int
    sub_question_text: str
    ground_truth: GroundTruthInfo
    unfiltered_accuracy: ChunkAccuracy
    filtered_accuracy: ChunkAccuracy
|
|
|
|
|
|
class ChunkEvalResult(BaseModel):
    """Chunk evaluation across all sub-questions with overall accuracies."""

    per_sub_question: List[SubQuestionChunkEval]
    overall_unfiltered: ChunkAccuracy
    overall_filtered: ChunkAccuracy
|
|
|
|
|
|
class SubQuestionResponseEval(BaseModel):
    """Response evaluation for one sub-question.

    Holds a ground-truth response, the matching pipeline response section,
    two float scores, and two timing figures.
    """

    sub_question_index: int
    sub_question_text: str
    ground_truth_response: str
    pipeline_response_section: str
    # No range constraint is enforced on either score.
    completeness_score: float
    factual_accuracy_score: float
    comments: str
    # Presumably milliseconds, per the `_ms` suffix.
    ground_truth_generation_time_ms: int
    comparison_time_ms: int
|
|
|
|
|
|
class ResponseEvalResult(BaseModel):
    """Response evaluation across all sub-questions with overall scores."""

    per_sub_question: List[SubQuestionResponseEval]
    overall_completeness: float
    overall_factual_accuracy: float
|
|
|
|
|
|
class EvaluationTiming(BaseModel):
    """Timing figures for each evaluation stage and the overall run.

    Every field is required; values are presumably milliseconds per the
    `_ms` suffix.
    """

    audio_evaluation_time_ms: int
    key_questions_evaluation_time_ms: int
    chunk_evaluation_time_ms: int
    response_evaluation_time_ms: int
    # Stored as supplied; not derived from the stage values in this model.
    total_evaluation_time_ms: int
|
|
|
|
|
|
class EvaluationResult(BaseModel):
    """Top-level record of one evaluation run.

    Each of the four stage results is optional and defaults to ``None``;
    ``status`` is one of "completed", "partial" or "failed".
    """

    evaluation_id: str
    # Id of the generation result this evaluation refers to.
    result_id: str
    # ISO-8601 UTC timestamp, computed at instantiation when not supplied.
    created_at: str = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc).isoformat()
    )
    status: Literal["completed", "partial", "failed"]
    audio_evaluation: Optional[AudioEvalResult] = None
    key_questions_evaluation: Optional[KeyQuestionsEvalResult] = None
    chunk_evaluation: Optional[ChunkEvalResult] = None
    response_evaluation: Optional[ResponseEvalResult] = None
    timing: EvaluationTiming
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Listing response models
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class ListItemInfo(BaseModel):
    """Summary row describing one stored item.

    All fields are required and carried as plain strings or ints; no
    Literal restriction is applied to ``input_type`` or ``profile`` here.
    """

    result_id: str
    input_type: str
    profile: str
    label: str
    created_at: str
    file_size_bytes: int
|
|
|
|
|
|
# Type aliases, presumably used as endpoint response types -- their use
# sites are outside this file; TODO confirm.

# Both list aliases name the same shape: a list of ListItemInfo.
LIST_RESULTS_RESPONSE = LIST_EVALUATIONS_RESPONSE = List[ListItemInfo]

# Both table aliases name the bare item model.
TABLE_RESULTS_RESPONSE = TABLE_EVALUATIONS_RESPONSE = ListItemInfo
|