# Source: legco_ai_assistant/backend/app/models/testing.py
# (file-viewer metadata: 244 lines, 6.3 KiB, Python)

from datetime import datetime, timezone
from typing import Any, Dict, List, Literal, Optional
from pydantic import BaseModel, Field, model_validator
from app.models.common import SourceMetadata
# ---------------------------------------------------------------------------
# Request models
# ---------------------------------------------------------------------------
class GenerateTextRequest(BaseModel):
    """Request model: a question, a profile selector, and an optional label."""

    # Required; must contain at least one character (min_length=1).
    question: str = Field(..., min_length=1)

    # Required; only the literal values "A", "B" or "C" validate.
    profile: Literal["A", "B", "C"]

    # Optional; empty string when omitted.
    label: str = ""
class EvaluatorConfig(BaseModel):
    """Connection settings for a single evaluator model."""

    # Pydantic v2 (2.0-2.9) reserves the "model_" prefix and emits a
    # UserWarning for the `model_name` field below.  Clearing the protected
    # namespaces silences that warning without renaming the field, so the
    # wire format stays unchanged.
    model_config = {"protected_namespaces": ()}

    model_name: str
    base_url: str

    # NOTE(review): the name suggests this holds the NAME of an environment
    # variable containing the API key, not the key itself - confirm with the
    # code that consumes it.
    api_key_env: str

    # Optional flag; off unless explicitly enabled.
    enable_thinking: bool = False
class EvaluationConfigRequest(BaseModel):
    """Evaluator configuration bundle supplied with an evaluation request.

    Holds a list of evaluators for key questions plus one evaluator each
    for chunks and for responses.  All three fields are required.
    """

    key_questions_evaluators: List[EvaluatorConfig]
    chunk_evaluator: EvaluatorConfig
    response_evaluator: EvaluatorConfig
class EvaluateRequest(BaseModel):
    """Request model for an evaluation.

    The result to evaluate is identified either by ``result_id`` or by an
    inline ``results`` payload; at least one of the two must be present.
    """

    # Empty string is treated as "not provided" by the validator below.
    result_id: str = ""

    # Forward reference: GenerateResult is declared later in this module.
    results: Optional["GenerateResult"] = None

    evaluation_config: EvaluationConfigRequest

    @model_validator(mode="after")
    def _require_result_source(self) -> "EvaluateRequest":
        """Reject requests that carry neither ``result_id`` nor ``results``.

        Raises:
            ValueError: if ``result_id`` is falsy and ``results`` is None.
        """
        has_result_id = bool(self.result_id)
        has_inline_results = self.results is not None
        if has_result_id or has_inline_results:
            return self
        raise ValueError("Either result_id or inline results must be provided")
# ---------------------------------------------------------------------------
# Result models (output of generation endpoints)
# ---------------------------------------------------------------------------
class ChunkEntry(BaseModel):
    """One chunk: its index, text, free-form metadata and a distance value."""

    chunk_index: int
    text: str

    # Arbitrary string-keyed mapping; value types are not constrained.
    metadata: Dict[str, Any]

    # Optional; 0.0 when omitted.
    distance: float = 0.0
class SubQuestionChunks(BaseModel):
    """The chunks associated with one sub-question (index and text)."""

    sub_question_index: int
    sub_question_text: str
    chunks: List[ChunkEntry]
class RetrievalResult(BaseModel):
    """Per-sub-question chunk lists plus a total count and a timing value."""

    per_sub_question: List[SubQuestionChunks]
    total_chunks_retrieved: int

    # The "_ms" suffix suggests milliseconds - confirm with the producer.
    retriever_time_ms: int
class FilteredResult(BaseModel):
    """Per-sub-question chunk lists plus a filtered count and a timing value.

    Same shape as ``RetrievalResult`` with filter-specific field names.
    """

    per_sub_question: List[SubQuestionChunks]
    total_chunks_filtered: int

    # The "_ms" suffix suggests milliseconds - confirm with the producer.
    filter_time_ms: int
class SubQuestionSources(BaseModel):
    """The source metadata entries associated with one sub-question."""

    sub_question_index: int
    sub_question_text: str

    # SourceMetadata comes from app.models.common.
    sources: List[SourceMetadata]
class ResponseResult(BaseModel):
    """Final answer text, its per-sub-question sources and a timing value."""

    final_answer: str
    sub_question_sources: List[SubQuestionSources]

    # The "_ms" suffix suggests milliseconds - confirm with the producer.
    generate_time_ms: int
class InputInfo(BaseModel):
    """Description of the input: required text plus optional audio details.

    Only ``text`` is required; every other field falls back to an empty
    string or 0.0 when omitted.
    """

    text: str

    reference_transcript: str = ""
    audio_filename: str = ""
    audio_duration_seconds: float = 0.0
    asr_language: str = ""
class TimingInfo(BaseModel):
    """Integer timing values for the individual stages and the overall total.

    The "_ms" suffix suggests milliseconds - confirm with the producer.
    """

    decomposer_time_ms: int
    retriever_time_ms: int
    filter_time_ms: int
    generator_time_ms: int
    total_time_ms: int

    # Optional; 0 when omitted.
    asr_time_ms: int = 0
class GenerateResult(BaseModel):
    """Complete generation result: input, key questions, chunks, answer, timing."""

    result_id: str

    # Only "text" or "audio" validate.
    input_type: Literal["text", "audio"]

    # Plain string here (not restricted to a fixed set of values).
    profile: str

    # Optional; empty string when omitted.
    label: str = ""

    # When omitted, filled with the current UTC time as an ISO-8601 string.
    created_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )

    input: InputInfo
    extracted_key_questions: List[str]
    retrieval: RetrievalResult
    filtered: FilteredResult
    response: ResponseResult
    timing: TimingInfo
# ---------------------------------------------------------------------------
# Evaluation models
# ---------------------------------------------------------------------------
class DimensionScores(BaseModel):
    """Four bounded score dimensions whose maxima add up to 100.

    Every field is required and validated against an inclusive range
    starting at 0.
    """

    # 準確性 = accuracy; range 0..40.
    dimension_1_準確性: float = Field(ge=0, le=40)

    # 完整性 = completeness; range 0..25.
    dimension_2_完整性: float = Field(ge=0, le=25)

    # 清晰度 = clarity; range 0..20.
    dimension_3_清晰度: float = Field(ge=0, le=20)

    # 簡潔性 = conciseness; range 0..15.
    dimension_4_簡潔性: float = Field(ge=0, le=15)
class KeyQuestionsEvalEntry(BaseModel):
    """Key-questions evaluation produced by one evaluator model."""

    # Pydantic v2 (2.0-2.9) reserves the "model_" prefix and emits a
    # UserWarning for the `model_name` field below.  Clearing the protected
    # namespaces silences that warning without renaming the field, so the
    # wire format stays unchanged.
    model_config = {"protected_namespaces": ()}

    model_name: str
    scores: DimensionScores

    # Required; inclusive range 0..100.
    # NOTE(review): this is an int while the DimensionScores fields are
    # floats.  If the total is computed as their sum, a fractional sum would
    # fail validation here - confirm the producer rounds before assigning.
    total_score: int = Field(ge=0, le=100)

    # Optional; 100 when omitted.
    max_score: int = 100

    comments: str
    thinking_trace: str

    # The "_ms" suffix suggests milliseconds - confirm with the producer.
    time_ms: int
class KeyQuestionsEvalResult(BaseModel):
    """Per-evaluator key-questions entries together with averaged scores."""

    evaluations: List[KeyQuestionsEvalEntry]

    # Reuses DimensionScores, so the same per-dimension bounds apply.
    average_scores: DimensionScores

    average_total: float
class AudioEvalResult(BaseModel):
    """Audio evaluation outcome: a status, two optional rates and counters.

    Every field has a default, so the model can be built with no arguments.
    """

    # Either "completed" (the default) or "na".
    status: Literal["completed", "na"] = "completed"

    # presumably character / word error rate - confirm; None when omitted.
    cer: Optional[float] = None
    wer: Optional[float] = None

    # Length and edit-operation counters; all 0 when omitted.
    reference_length: int = 0
    transcribed_length: int = 0
    substitutions: int = 0
    deletions: int = 0
    insertions: int = 0
    hits: int = 0
class ChunkAccuracy(BaseModel):
    """Precision / recall / F1 values together with two chunk counts.

    No range constraints are declared on the float fields.
    """

    precision: float
    recall: float
    f1: float

    pipeline_chunks: int
    relevant_in_pipeline: int
class GroundTruthInfo(BaseModel):
    """Ground-truth documents and chunks, with a count and a timing value."""

    relevant_documents: List[str]

    # Each chunk is an arbitrary string-keyed mapping; no schema is enforced.
    relevant_chunks: List[Dict[str, Any]]

    total_relevant_chunks: int

    # The "_ms" suffix suggests milliseconds - confirm with the producer.
    chunk_evaluation_time_ms: int
class SubQuestionChunkEval(BaseModel):
    """Chunk evaluation for one sub-question.

    Carries the ground truth and two accuracy records, one labelled
    unfiltered and one labelled filtered.
    """

    sub_question_index: int
    sub_question_text: str
    ground_truth: GroundTruthInfo
    unfiltered_accuracy: ChunkAccuracy
    filtered_accuracy: ChunkAccuracy
class ChunkEvalResult(BaseModel):
    """Chunk evaluation: per-sub-question entries plus two overall records."""

    per_sub_question: List[SubQuestionChunkEval]
    overall_unfiltered: ChunkAccuracy
    overall_filtered: ChunkAccuracy
class SubQuestionResponseEval(BaseModel):
    """Response evaluation for one sub-question.

    Pairs a ground-truth response with the pipeline's response section and
    records two scores, free-text comments and two timing values.
    """

    sub_question_index: int
    sub_question_text: str

    ground_truth_response: str
    pipeline_response_section: str

    # No range constraints are declared on the scores.
    completeness_score: float
    factual_accuracy_score: float

    comments: str

    # The "_ms" suffix suggests milliseconds - confirm with the producer.
    ground_truth_generation_time_ms: int
    comparison_time_ms: int
class ResponseEvalResult(BaseModel):
    """Response evaluation: per-sub-question entries plus two overall scores."""

    per_sub_question: List[SubQuestionResponseEval]
    overall_completeness: float
    overall_factual_accuracy: float
class EvaluationTiming(BaseModel):
    """Integer timing values for each evaluation stage and the overall total.

    All fields are required.  The "_ms" suffix suggests milliseconds -
    confirm with the producer.
    """

    audio_evaluation_time_ms: int
    key_questions_evaluation_time_ms: int
    chunk_evaluation_time_ms: int
    response_evaluation_time_ms: int
    total_evaluation_time_ms: int
class EvaluationResult(BaseModel):
    """Complete evaluation record: ids, status, optional sections and timing."""

    evaluation_id: str
    result_id: str

    # When omitted, filled with the current UTC time as an ISO-8601 string.
    created_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )

    # Required; one of "completed", "partial" or "failed".
    status: Literal["completed", "partial", "failed"]

    # Each evaluation section is optional and is None when omitted.
    audio_evaluation: Optional[AudioEvalResult] = None
    key_questions_evaluation: Optional[KeyQuestionsEvalResult] = None
    chunk_evaluation: Optional[ChunkEvalResult] = None
    response_evaluation: Optional[ResponseEvalResult] = None

    timing: EvaluationTiming
# ---------------------------------------------------------------------------
# Listing response models
# ---------------------------------------------------------------------------
class ListItemInfo(BaseModel):
    """Summary row used by the listing response aliases.

    All fields are required; ``input_type`` and ``profile`` are plain
    strings here rather than restricted literals.
    """

    result_id: str
    input_type: str
    profile: str
    label: str
    created_at: str
    file_size_bytes: int
# Response type aliases built on ListItemInfo.

# List-shaped aliases: a list of ListItemInfo entries.
LIST_RESULTS_RESPONSE = List[ListItemInfo]
LIST_EVALUATIONS_RESPONSE = List[ListItemInfo]

# Table-shaped aliases: the bare ListItemInfo model (not a list).
TABLE_RESULTS_RESPONSE = ListItemInfo
TABLE_EVALUATIONS_RESPONSE = ListItemInfo