from datetime import datetime, timezone
from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, Field, model_validator

from app.models.common import SourceMetadata


def _utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


# ---------------------------------------------------------------------------
# Request models
# ---------------------------------------------------------------------------


# Text-generation request: a non-empty question, one of the three profile
# codes, and an optional free-form label (empty by default).
class GenerateTextRequest(BaseModel):
    question: str = Field(..., min_length=1)
    profile: Literal["A", "B", "C"]
    label: str = ""


# Connection settings for a single evaluator model.
# `api_key_env` presumably names the environment variable that holds the API
# key rather than the key itself -- confirm against the consumer.
# NOTE(review): depending on the pydantic version, a field called `model_name`
# may trigger a protected-namespace (`model_`) warning at class creation.
class EvaluatorConfig(BaseModel):
    model_name: str
    base_url: str
    api_key_env: str
    enable_thinking: bool = False


# Evaluator selection for an evaluation run: a list of evaluators for the key
# questions and exactly one evaluator each for chunks and for the response.
class EvaluationConfigRequest(BaseModel):
    key_questions_evaluators: List[EvaluatorConfig]
    chunk_evaluator: EvaluatorConfig
    response_evaluator: EvaluatorConfig


# Evaluation request. The result to evaluate is supplied either as
# `result_id` or inline through `results`; at least one of the two is
# required (enforced by the validator below).
class EvaluateRequest(BaseModel):
    result_id: str = ""
    # Forward reference: GenerateResult is declared further down this module.
    results: Optional["GenerateResult"] = None
    evaluation_config: EvaluationConfigRequest

    @model_validator(mode="after")
    def _require_result_source(self) -> "EvaluateRequest":
        """Reject a request that has neither a result id nor inline results.

        Returns:
            The validated model instance, unchanged.

        Raises:
            ValueError: If `result_id` is empty and `results` is None.
        """
        has_result_id = bool(self.result_id)
        has_inline_results = self.results is not None
        if has_result_id or has_inline_results:
            return self
        raise ValueError("Either result_id or inline results must be provided")


# ---------------------------------------------------------------------------
# Result models (output of generation endpoints)
# ---------------------------------------------------------------------------


# A single chunk: its index, text, free-form metadata and a distance value
# that defaults to 0.0 when not supplied.
class ChunkEntry(BaseModel):
    chunk_index: int
    text: str
    metadata: Dict[str, Any]
    distance: float = 0.0


# The chunks associated with one sub-question.
class SubQuestionChunks(BaseModel):
    sub_question_index: int
    sub_question_text: str
    chunks: List[ChunkEntry]


# Retrieval-stage output: chunks grouped per sub-question, plus a total count
# and the stage duration (the `_ms` suffix suggests milliseconds).
class RetrievalResult(BaseModel):
    per_sub_question: List[SubQuestionChunks]
    total_chunks_retrieved: int
    retriever_time_ms: int


# Filter-stage output; same shape as RetrievalResult with filter-specific
# count and timing fields.
class FilteredResult(BaseModel):
    per_sub_question: List[SubQuestionChunks]
    total_chunks_filtered: int
    filter_time_ms: int


# The source metadata entries attached to one sub-question.
class SubQuestionSources(BaseModel):
    sub_question_index: int
    sub_question_text: str
    sources: List[SourceMetadata]


# Generated response: the final answer text, per-sub-question sources and the
# generation duration.
class ResponseResult(BaseModel):
    final_answer: str
    sub_question_sources: List[SubQuestionSources]
    generate_time_ms: int


# Description of the input. Only `text` is required; the transcript and
# audio/ASR fields default to empty values.
class InputInfo(BaseModel):
    text: str
    reference_transcript: str = ""
    audio_filename: str = ""
    audio_duration_seconds: float = 0.0
    asr_language: str = ""


# Per-stage durations of a generation run. `asr_time_ms` is the only optional
# entry and defaults to 0.
class TimingInfo(BaseModel):
    decomposer_time_ms: int
    retriever_time_ms: int
    filter_time_ms: int
    generator_time_ms: int
    total_time_ms: int
    asr_time_ms: int = 0


# Complete record of one generation run, covering the input, the extracted key
# questions, the retrieval/filter/response outputs and the timings.
class GenerateResult(BaseModel):
    result_id: str
    input_type: Literal["text", "audio"]
    profile: str
    label: str = ""
    # Filled with the current UTC timestamp when the caller omits it.
    created_at: str = Field(default_factory=_utc_now_iso)
    input: InputInfo
    extracted_key_questions: List[str]
    retrieval: RetrievalResult
    filtered: FilteredResult
    response: ResponseResult
    timing: TimingInfo


# ---------------------------------------------------------------------------
# Evaluation models
# ---------------------------------------------------------------------------


# Four bounded integer scores. The upper bounds (40 + 25 + 20 + 15) add up to
# 100, matching the 0-100 range of KeyQuestionsEvalEntry.total_score.
class DimensionScores(BaseModel):
    dimension_1_準確性: int = Field(ge=0, le=40)  # accuracy
    dimension_2_完整性: int = Field(ge=0, le=25)  # completeness
    dimension_3_清晰度: int = Field(ge=0, le=20)  # clarity
    dimension_4_簡潔性: int = Field(ge=0, le=15)  # conciseness


# One evaluator's assessment: per-dimension scores, a 0-100 total, comments,
# a thinking trace and the time taken.
# NOTE(review): `model_name` may trigger pydantic's protected-namespace
# warning on some versions -- confirm.
class KeyQuestionsEvalEntry(BaseModel):
    model_name: str
    scores: DimensionScores
    total_score: int = Field(ge=0, le=100)
    max_score: int = 100
    comments: str
    thinking_trace: str
    time_ms: int


# All evaluator entries together with their averaged scores.
# NOTE(review): `average_scores` reuses the int-typed DimensionScores, so the
# producer presumably rounds the averages before storing them -- confirm.
class KeyQuestionsEvalResult(BaseModel):
    evaluations: List[KeyQuestionsEvalEntry]
    average_scores: DimensionScores
    average_total: float


# Audio evaluation outcome. `cer` / `wer` are presumably character and word
# error rates (TODO confirm); both are optional. Every count defaults to 0
# and `status` defaults to "completed".
class AudioEvalResult(BaseModel):
    status: Literal["completed", "na"] = "completed"
    cer: Optional[float] = None
    wer: Optional[float] = None
    reference_length: int = 0
    transcribed_length: int = 0
    substitutions: int = 0
    deletions: int = 0
    insertions: int = 0
    hits: int = 0


# Precision / recall / F1 figures together with the chunk counts reported
# alongside them.
class ChunkAccuracy(BaseModel):
    precision: float
    recall: float
    f1: float
    pipeline_chunks: int
    relevant_in_pipeline: int


# Ground-truth relevance data for one sub-question, plus the time spent on the
# chunk evaluation.
class GroundTruthInfo(BaseModel):
    relevant_documents: List[str]
    relevant_chunks: List[Dict[str, Any]]
    total_relevant_chunks: int
    chunk_evaluation_time_ms: int


# Chunk evaluation for one sub-question: the ground truth and one accuracy
# record each for the unfiltered and the filtered chunk sets.
class SubQuestionChunkEval(BaseModel):
    sub_question_index: int
    sub_question_text: str
    ground_truth: GroundTruthInfo
    unfiltered_accuracy: ChunkAccuracy
    filtered_accuracy: ChunkAccuracy


# Chunk evaluation across all sub-questions, with overall accuracy records.
class ChunkEvalResult(BaseModel):
    per_sub_question: List[SubQuestionChunkEval]
    overall_unfiltered: ChunkAccuracy
    overall_filtered: ChunkAccuracy


# Response evaluation for one sub-question: the ground-truth response, the
# matching pipeline response section, two float scores, comments and timings.
class SubQuestionResponseEval(BaseModel):
    sub_question_index: int
    sub_question_text: str
    ground_truth_response: str
    pipeline_response_section: str
    completeness_score: float
    factual_accuracy_score: float
    comments: str
    ground_truth_generation_time_ms: int
    comparison_time_ms: int


# Response evaluation across all sub-questions, with overall scores.
class ResponseEvalResult(BaseModel):
    per_sub_question: List[SubQuestionResponseEval]
    overall_completeness: float
    overall_factual_accuracy: float


# Durations of each evaluation stage and of the evaluation as a whole.
class EvaluationTiming(BaseModel):
    audio_evaluation_time_ms: int
    key_questions_evaluation_time_ms: int
    chunk_evaluation_time_ms: int
    response_evaluation_time_ms: int
    total_evaluation_time_ms: int


# Complete record of one evaluation run. Each of the four evaluation sections
# is optional and defaults to None; `timing` is always required.
class EvaluationResult(BaseModel):
    evaluation_id: str
    result_id: str
    # Filled with the current UTC timestamp when the caller omits it.
    created_at: str = Field(default_factory=_utc_now_iso)
    status: Literal["completed", "partial", "failed"]
    audio_evaluation: Optional[AudioEvalResult] = None
    key_questions_evaluation: Optional[KeyQuestionsEvalResult] = None
    chunk_evaluation: Optional[ChunkEvalResult] = None
    response_evaluation: Optional[ResponseEvalResult] = None
    timing: EvaluationTiming


# ---------------------------------------------------------------------------
# Listing response models
# ---------------------------------------------------------------------------


# Summary row describing one stored item, including its size in bytes.
class ListItemInfo(BaseModel):
    result_id: str
    input_type: str
    profile: str
    label: str
    created_at: str
    file_size_bytes: int


# Response type aliases: the LIST_* aliases are lists of summary rows, the
# TABLE_* aliases are a single summary row.
LIST_RESULTS_RESPONSE = List[ListItemInfo]
LIST_EVALUATIONS_RESPONSE = List[ListItemInfo]
TABLE_RESULTS_RESPONSE = ListItemInfo
TABLE_EVALUATIONS_RESPONSE = ListItemInfo