diff --git a/backend/.env.example b/backend/.env.example
index 3788383..ed2f10d 100644
--- a/backend/.env.example
+++ b/backend/.env.example
@@ -64,3 +64,13 @@ QA_MAX_CHUNK_TOKENS=3000
 QA_STRUCTURE_MODEL=
 QA_INCLUDE_INTERNAL_REFS=true
 QA_CACHE_VISION_RESULTS=true
+
+# Test results storage (Package 9)
+TEST_RESULTS_DIR=./data/test_results
+TEST_EVALUATIONS_DIR=./data/test_evaluations
+
+# Evaluation batching (Package 9)
+EVAL_CHUNK_BATCH_SIZE=10
+EVAL_MAX_CONCURRENT_BATCHES=10
+EVAL_BATCH_RETRY_COUNT=2
+EVAL_BATCH_RETRY_DELAY_MS=2000
diff --git a/backend/app/core/config.py b/backend/app/core/config.py
index cd34fdc..e78397b 100644
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -52,6 +52,14 @@ class Settings(BaseSettings):
     qa_include_internal_refs: bool = True
     qa_cache_vision_results: bool = True
 
+    # Accuracy testing & evaluation (Package 9); defaults mirror .env.example
+    test_results_dir: str = "./data/test_results"
+    test_evaluations_dir: str = "./data/test_evaluations"
+    eval_chunk_batch_size: int = 10
+    eval_max_concurrent_batches: int = 10
+    eval_batch_retry_count: int = 2
+    eval_batch_retry_delay_ms: int = 2000  # milliseconds
+
     # ASR Configuration (Phase 2 + Phase 5)
     # Provider: "dashscope" (batch + realtime) or "openrouter" (batch-only)
     asr_provider: str = "dashscope"
diff --git a/backend/app/models/testing.py b/backend/app/models/testing.py
new file mode 100644
index 0000000..0b41e54
--- /dev/null
+++ b/backend/app/models/testing.py
@@ -0,0 +1,243 @@
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Literal, Optional
+
+from pydantic import BaseModel, Field, model_validator
+
+from app.models.common import SourceMetadata
+
+
+# ---------------------------------------------------------------------------
+# Request models
+# ---------------------------------------------------------------------------
+
+
+class GenerateTextRequest(BaseModel):
+    question: str = Field(..., min_length=1)  # required; the empty string is rejected
+    profile: Literal["A", "B", "C"]  # only these three profile codes validate
+    label: str = ""
+
+
+class EvaluatorConfig(BaseModel):
+    model_name: str
+    base_url: str
+    api_key_env: str  # presumably the NAME of an env var holding the key, not the key itself - confirm
+    enable_thinking: bool = False
+
+
+class EvaluationConfigRequest(BaseModel):
+    key_questions_evaluators: List[EvaluatorConfig]  # an empty list passes validation here
+    chunk_evaluator: EvaluatorConfig
+    response_evaluator: EvaluatorConfig
+
+
+class EvaluateRequest(BaseModel):
+    result_id: str = ""  # empty string is treated as "not supplied" by the validator below
+    results: Optional["GenerateResult"] = None  # forward reference: GenerateResult is defined later in this module
+    evaluation_config: EvaluationConfigRequest
+
+    @model_validator(mode="after")
+    def _require_result_source(self) -> "EvaluateRequest":  # runs after field validation
+        if not self.result_id and self.results is None:  # neither a stored id nor inline results
+            raise ValueError("Either result_id or inline results must be provided")
+        return self
+
+
+# ---------------------------------------------------------------------------
+# Result models (output of generation endpoints)
+# ---------------------------------------------------------------------------
+
+
+class ChunkEntry(BaseModel):
+    chunk_index: int
+    text: str
+    metadata: Dict[str, Any]
+    distance: float = 0.0
+
+
+class SubQuestionChunks(BaseModel):
+    sub_question_index: int
+    sub_question_text: str
+    chunks: List[ChunkEntry]
+
+
+class RetrievalResult(BaseModel):
+    per_sub_question: List[SubQuestionChunks]
+    total_chunks_retrieved: int
+    retriever_time_ms: int
+
+
+class FilteredResult(BaseModel):
+    per_sub_question: List[SubQuestionChunks]
+    total_chunks_filtered: int
+    filter_time_ms: int
+
+
+class SubQuestionSources(BaseModel):
+    sub_question_index: int
+    sub_question_text: str
+    sources: List[SourceMetadata]
+
+
+class ResponseResult(BaseModel):
+    final_answer: str
+    sub_question_sources: List[SubQuestionSources]
+    generate_time_ms: int
+
+
+class InputInfo(BaseModel):
+    text: str
+    reference_transcript: str = ""  # the remaining fields default to empty/zero values
+    audio_filename: str = ""
+    audio_duration_seconds: float = 0.0
+    asr_language: str = ""
+
+
+class TimingInfo(BaseModel):
+    decomposer_time_ms: int
+    retriever_time_ms: int
+    filter_time_ms: int
+    generator_time_ms: int
+    total_time_ms: int
+    asr_time_ms: int = 0  # the only optional timing; defaults to 0
+
+
+class GenerateResult(BaseModel):
+    result_id: str
+    input_type: Literal["text", "audio"]
+    profile: str  # NOTE(review): plain str here, while GenerateTextRequest restricts profile to A/B/C - confirm intended
+    label: str = ""
+    created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+    input: InputInfo
+    extracted_key_questions: List[str]
+    retrieval: RetrievalResult
+    filtered: FilteredResult
+    response: ResponseResult
+    timing: TimingInfo
+
+
+# ---------------------------------------------------------------------------
+# Evaluation models
+# ---------------------------------------------------------------------------
+
+
+class DimensionScores(BaseModel):
+    dimension_1_準確性: int = Field(ge=0, le=40)  # accuracy, 0-40
+    dimension_2_完整性: int = Field(ge=0, le=25)  # completeness, 0-25
+    dimension_3_清晰度: int = Field(ge=0, le=20)  # clarity, 0-20
+    dimension_4_簡潔性: int = Field(ge=0, le=15)  # conciseness, 0-15
+
+
+class KeyQuestionsEvalEntry(BaseModel):
+    model_name: str
+    scores: DimensionScores
+    total_score: int = Field(ge=0, le=100)  # 100 = 40 + 25 + 20 + 15, the dimension maxima; not cross-checked against scores
+    max_score: int = 100
+    comments: str
+    thinking_trace: str
+    time_ms: int
+
+
+class KeyQuestionsEvalResult(BaseModel):
+    evaluations: List[KeyQuestionsEvalEntry]
+    average_scores: DimensionScores  # NOTE(review): DimensionScores fields are int, so a fractional average is not representable - confirm rounding upstream
+    average_total: float
+
+
+class AudioEvalResult(BaseModel):
+    status: Literal["completed", "na"] = "completed"
+    cer: Optional[float] = None  # presumably character error rate - confirm
+    wer: Optional[float] = None  # presumably word error rate - confirm
+    reference_length: int = 0
+    transcribed_length: int = 0
+    substitutions: int = 0
+    deletions: int = 0
+    insertions: int = 0
+    hits: int = 0
+
+
+class ChunkAccuracy(BaseModel):
+    precision: float  # no range or cross-field validation is applied to these metrics
+    recall: float
+    f1: float
+    pipeline_chunks: int
+    relevant_in_pipeline: int
+
+
+class GroundTruthInfo(BaseModel):
+    relevant_documents: List[str]
+    relevant_chunks: List[Dict[str, Any]]
+    total_relevant_chunks: int
+    chunk_evaluation_time_ms: int
+
+
+class SubQuestionChunkEval(BaseModel):
+    sub_question_index: int
+    sub_question_text: str
+    ground_truth: GroundTruthInfo
+    unfiltered_accuracy: ChunkAccuracy
+    filtered_accuracy: ChunkAccuracy
+
+
+class ChunkEvalResult(BaseModel):
+    per_sub_question: List[SubQuestionChunkEval]
+    overall_unfiltered: ChunkAccuracy
+    overall_filtered: ChunkAccuracy
+
+
+class SubQuestionResponseEval(BaseModel):
+    sub_question_index: int
+    sub_question_text: str
+    ground_truth_response: str
+    pipeline_response_section: str
+    completeness_score: float
+    factual_accuracy_score: float
+    comments: str
+    ground_truth_generation_time_ms: int
+    comparison_time_ms: int
+
+
+class ResponseEvalResult(BaseModel):
+    per_sub_question: List[SubQuestionResponseEval]
+    overall_completeness: float
+    overall_factual_accuracy: float
+
+
+class EvaluationTiming(BaseModel):
+    audio_evaluation_time_ms: int
+    key_questions_evaluation_time_ms: int
+    chunk_evaluation_time_ms: int
+    response_evaluation_time_ms: int
+    total_evaluation_time_ms: int
+
+
+class EvaluationResult(BaseModel):
+    evaluation_id: str
+    result_id: str
+    created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+    status: Literal["completed", "partial", "failed"]
+    audio_evaluation: Optional[AudioEvalResult] = None  # each evaluation section may be omitted
+    key_questions_evaluation: Optional[KeyQuestionsEvalResult] = None
+    chunk_evaluation: Optional[ChunkEvalResult] = None
+    response_evaluation: Optional[ResponseEvalResult] = None
+    timing: EvaluationTiming  # required even when status is "failed"
+
+
+# ---------------------------------------------------------------------------
+# Listing response models
+# ---------------------------------------------------------------------------
+
+
+class ListItemInfo(BaseModel):
+    result_id: str
+    input_type: str
+    profile: str
+    label: str
+    created_at: str
+    file_size_bytes: int
+
+
+LIST_RESULTS_RESPONSE = List[ListItemInfo]
+LIST_EVALUATIONS_RESPONSE = List[ListItemInfo]  # NOTE(review): reuses the results item shape, which has no evaluation_id - confirm intended
+
+TABLE_RESULTS_RESPONSE = ListItemInfo  # plain alias of the single-item model
+TABLE_EVALUATIONS_RESPONSE = ListItemInfo  # plain alias of the single-item model
diff --git a/backend/app/test/test_phase9_config.py b/backend/app/test/test_phase9_config.py
new file mode 100644
index 0000000..16b4721
--- /dev/null
+++ b/backend/app/test/test_phase9_config.py
@@ -0,0 +1,74 @@
+"""Phase 9 tests: Config settings for accuracy testing & evaluation (Sub-Phase 9.0).
+
+Covers:
+- test_results_dir and test_evaluations_dir default values
+- eval_chunk_batch_size defaults to 10
+- eval_max_concurrent_batches defaults to 10
+- eval_batch_retry_count defaults to 2
+- eval_batch_retry_delay_ms defaults to 2000
+- Each of the six settings can be overridden through its environment variable
+"""
+
+from app.core.config import Settings
+
+
+class TestEvalConfigDefaults:
+    """Default values for the Package 9 evaluation settings when Settings() is built with no arguments."""
+
+    def test_default_results_dir(self):  # NOTE(review): these default tests assume no TEST_*/EVAL_* overrides in the ambient env or .env - confirm
+        s = Settings()
+        assert s.test_results_dir == "./data/test_results"
+
+    def test_default_evaluations_dir(self):
+        s = Settings()
+        assert s.test_evaluations_dir == "./data/test_evaluations"
+
+    def test_default_chunk_batch_size(self):
+        s = Settings()
+        assert s.eval_chunk_batch_size == 10
+
+    def test_default_max_concurrent_batches(self):
+        s = Settings()
+        assert s.eval_max_concurrent_batches == 10
+
+    def test_default_batch_retry_count(self):
+        s = Settings()
+        assert s.eval_batch_retry_count == 2
+
+    def test_default_batch_retry_delay_ms(self):
+        s = Settings()
+        assert s.eval_batch_retry_delay_ms == 2000
+
+
+class TestEvalConfigEnvOverrides:
+    """Environment variable overrides for evaluation settings; integer values are parsed from strings."""
+
+    def test_results_dir_from_env(self, monkeypatch):
+        monkeypatch.setenv("TEST_RESULTS_DIR", "/tmp/test_results")
+        s = Settings()
+        assert s.test_results_dir == "/tmp/test_results"
+
+    def test_evaluations_dir_from_env(self, monkeypatch):
+        monkeypatch.setenv("TEST_EVALUATIONS_DIR", "/tmp/test_eval")
+        s = Settings()
+        assert s.test_evaluations_dir == "/tmp/test_eval"
+
+    def test_chunk_batch_size_from_env(self, monkeypatch):
+        monkeypatch.setenv("EVAL_CHUNK_BATCH_SIZE", "5")
+        s = Settings()
+        assert s.eval_chunk_batch_size == 5
+
+    def test_max_concurrent_batches_from_env(self, monkeypatch):
+        monkeypatch.setenv("EVAL_MAX_CONCURRENT_BATCHES", "5")
+        s = Settings()
+        assert s.eval_max_concurrent_batches == 5
+
+    def test_batch_retry_count_from_env(self, monkeypatch):
+        monkeypatch.setenv("EVAL_BATCH_RETRY_COUNT", "3")
+        s = Settings()
+        assert s.eval_batch_retry_count == 3
+
+    def test_batch_retry_delay_ms_from_env(self, monkeypatch):
+        monkeypatch.setenv("EVAL_BATCH_RETRY_DELAY_MS", "5000")
+        s = Settings()
+        assert s.eval_batch_retry_delay_ms == 5000
diff --git a/backend/app/test/test_phase9_models.py b/backend/app/test/test_phase9_models.py
new file mode 100644
index 0000000..cb6173e
--- /dev/null
+++ b/backend/app/test/test_phase9_models.py
@@ -0,0 +1,351 @@
+"""Phase 9 tests: Pydantic models for accuracy testing & evaluation (Sub-Phase 9.0).
+
+Covers:
+- GenerateTextRequest validation
+- EvaluateRequest validation (with a result_id, and with neither result_id nor inline results)
+- EvaluatorConfig model
+- Key questions scoring model (DimensionScores: the four dimension_N_* fields and their ranges)
+- EvaluationResult with all four evaluation types
+- Chunk evaluation models (ChunkAccuracy precision/recall/F1 fields)
+- JSON serialization round-trip
+"""
+import json
+from typing import Optional  # NOTE(review): not referenced anywhere in this file
+
+import pytest
+from pydantic import ValidationError
+
+from app.models.testing import (
+    GenerateTextRequest,
+    EvaluateRequest,
+    EvaluatorConfig,
+    EvaluationConfigRequest,
+    DimensionScores,
+    GenerateResult,
+    InputInfo,
+    TimingInfo,
+    RetrievalResult,
+    FilteredResult,
+    ResponseResult,
+    SubQuestionChunks,
+    SubQuestionSources,
+    ChunkEntry,
+    AudioEvalResult,
+    KeyQuestionsEvalEntry,
+    KeyQuestionsEvalResult,
+    ChunkAccuracy,
+    GroundTruthInfo,
+    SubQuestionChunkEval,
+    ChunkEvalResult,
+    SubQuestionResponseEval,
+    ResponseEvalResult,
+    EvaluationTiming,
+    EvaluationResult,
+    LIST_RESULTS_RESPONSE,
+    LIST_EVALUATIONS_RESPONSE,
+    TABLE_RESULTS_RESPONSE,
+    TABLE_EVALUATIONS_RESPONSE,
+)
+
+
+class TestTextGeneration:
+    def test_valid_request(self):
+        req = GenerateTextRequest(
+            question="test question",
+            profile="A",
+        )
+        assert req.question == "test question"
+        assert req.profile == "A"
+        assert req.label == ""
+
+    def test_label_is_optional(self):
+        req = GenerateTextRequest(
+            question="test",
+            profile="B",
+            label="my label",
+        )
+        assert req.label == "my label"
+
+    def test_invalid_profile_rejected(self):
+        with pytest.raises(ValidationError) as exc_info:
+            GenerateTextRequest(question="test", profile="D")
+        errors = exc_info.value.errors()
+        assert any("profile" in str(e.get("loc", [])) for e in errors)  # the error must be attributed to the profile field
+
+    def test_empty_question_rejected(self):
+        with pytest.raises(ValidationError):
+            GenerateTextRequest(question="", profile="A")
+
+
+class TestEvaluatorConfig:
+    def test_valid_evaluator(self):
+        cfg = EvaluatorConfig(
+            model_name="deepseek-v4-pro",
+            base_url="https://api.deepseek.com",
+            api_key_env="DP_API_KEY",
+            enable_thinking=True,
+        )
+        assert cfg.model_name == "deepseek-v4-pro"
+        assert cfg.enable_thinking is True
+
+    def test_thinking_defaults_false(self):
+        cfg = EvaluatorConfig(
+            model_name="test-model",
+            base_url="https://example.com",
+            api_key_env="TEST_KEY",
+        )
+        assert cfg.enable_thinking is False
+
+
+class TestDimensionScores:
+    def test_valid_scores(self):
+        ds = DimensionScores(
+            dimension_1_準確性=35,
+            dimension_2_完整性=22,
+            dimension_3_清晰度=18,
+            dimension_4_簡潔性=13,
+        )
+        assert ds.dimension_1_準確性 == 35
+        assert ds.dimension_2_完整性 == 22
+
+    def test_準確性_exceeds_max_rejected(self):
+        with pytest.raises(ValidationError):
+            DimensionScores(
+                dimension_1_準確性=41,  # one above the le=40 bound
+                dimension_2_完整性=22,
+                dimension_3_清晰度=18,
+                dimension_4_簡潔性=13,
+            )
+
+    def test_完整性_exceeds_max_rejected(self):
+        with pytest.raises(ValidationError):
+            DimensionScores(
+                dimension_1_準確性=35,
+                dimension_2_完整性=26,  # one above the le=25 bound
+                dimension_3_清晰度=18,
+                dimension_4_簡潔性=13,
+            )
+
+    def test_negative_score_rejected(self):
+        with pytest.raises(ValidationError):
+            DimensionScores(
+                dimension_1_準確性=-1,  # below the ge=0 bound
+                dimension_2_完整性=22,
+                dimension_3_清晰度=18,
+                dimension_4_簡潔性=13,
+            )
+
+    def test_serialization_preserves_chinese_keys(self):
+        ds = DimensionScores(
+            dimension_1_準確性=35,
+            dimension_2_完整性=22,
+            dimension_3_清晰度=18,
+            dimension_4_簡潔性=13,
+        )
+        data = json.loads(ds.model_dump_json())
+        assert data["dimension_1_準確性"] == 35
+        assert data["dimension_3_清晰度"] == 18
+
+
+class TestEvaluationRequest:
+    def test_with_result_id(self):
+        req = EvaluateRequest(
+            result_id="abc-123",
+            evaluation_config=EvaluationConfigRequest(
+                key_questions_evaluators=[
+                    EvaluatorConfig(
+                        model_name="deepseek-v4-pro",
+                        base_url="https://api.deepseek.com",
+                        api_key_env="DP_API_KEY",
+                        enable_thinking=True,
+                    ),
+                    EvaluatorConfig(
+                        model_name="qwen3-7b-max",
+                        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
+                        api_key_env="DASHSCOPE_API_KEY",
+                        enable_thinking=True,
+                    ),
+                ],
+                chunk_evaluator=EvaluatorConfig(
+                    model_name="qwen/qwen3.6-35b-a3b",
+                    base_url="https://openrouter.ai/api/v1",
+                    api_key_env="LLM_API_KEY",
+                    enable_thinking=True,
+                ),
+                response_evaluator=EvaluatorConfig(
+                    model_name="qwen/qwen3.6-35b-a3b",
+                    base_url="https://openrouter.ai/api/v1",
+                    api_key_env="LLM_API_KEY",
+                    enable_thinking=True,
+                ),
+            ),
+        )
+        assert req.result_id == "abc-123"
+
+    def test_without_keys_fails(self):  # despite the name, this omits both result_id and results, which the model validator rejects
+        with pytest.raises(ValidationError):
+            EvaluateRequest(
+                evaluation_config=EvaluationConfigRequest(
+                    key_questions_evaluators=[],
+                    chunk_evaluator=EvaluatorConfig(
+                        model_name="test",
+                        base_url="https://example.com",
+                        api_key_env="TEST_KEY",
+                    ),
+                    response_evaluator=EvaluatorConfig(
+                        model_name="test",
+                        base_url="https://example.com",
+                        api_key_env="TEST_KEY",
+                    ),
+                ),
+            )
+
+
+class TestAudioEvalResult:
+    def test_complete_cer_wer(self):
+        aer = AudioEvalResult(
+            cer=0.052,
+            wer=0.083,
+            reference_length=42,
+            transcribed_length=40,
+            substitutions=1,
+            deletions=2,
+            insertions=0,
+            hits=39,
+        )
+        assert aer.cer == 0.052
+        assert aer.status == "completed"  # status was not passed, so this checks the default
+
+    def test_na_when_no_reference(self):
+        aer = AudioEvalResult(status="na")
+        assert aer.status == "na"
+        assert aer.cer is None
+        assert aer.wer is None
+
+
+class TestChunkAccuracy:
+    def test_perfect_precision_recall(self):
+        ca = ChunkAccuracy(
+            precision=1.0,
+            recall=1.0,
+            f1=1.0,
+            pipeline_chunks=3,
+            relevant_in_pipeline=3,
+        )
+        assert ca.f1 == 1.0
+
+    def test_zero_precision(self):
+        ca = ChunkAccuracy(
+            precision=0.0,
+            recall=0.8,  # NOTE(review): recall > 0 with relevant_in_pipeline=0 looks inconsistent; the model does not cross-validate
+            f1=0.0,
+            pipeline_chunks=5,
+            relevant_in_pipeline=0,
+        )
+        assert ca.precision == 0.0
+        assert ca.f1 == 0.0
+
+
+class TestEvalResult:
+    def test_completed_with_all_dimensions(self):
+        result = EvaluationResult(
+            evaluation_id="eval-001",
+            result_id="result-001",
+            status="completed",
+            audio_evaluation=AudioEvalResult(
+                cer=0.05,
+                wer=0.08,
+                reference_length=42,
+                transcribed_length=40,
+                substitutions=1,
+                deletions=2,
+                insertions=0,
+                hits=39,
+            ),
+            key_questions_evaluation=KeyQuestionsEvalResult(
+                evaluations=[
+                    KeyQuestionsEvalEntry(
+                        model_name="deepseek-v4-pro",
+                        scores=DimensionScores(
+                            dimension_1_準確性=35,
+                            dimension_2_完整性=22,
+                            dimension_3_清晰度=18,
+                            dimension_4_簡潔性=13,
+                        ),
+                        total_score=88,  # 35 + 22 + 18 + 13
+                        max_score=100,
+                        comments="good",
+                        thinking_trace="...",
+                        time_ms=3000,
+                    ),
+                ],
+                average_scores=DimensionScores(
+                    dimension_1_準確性=35,
+                    dimension_2_完整性=22,
+                    dimension_3_清晰度=18,
+                    dimension_4_簡潔性=13,
+                ),
+                average_total=88.0,
+            ),
+            chunk_evaluation=ChunkEvalResult(
+                per_sub_question=[],
+                overall_unfiltered=ChunkAccuracy(
+                    precision=0.6, recall=1.0, f1=0.75,
+                    pipeline_chunks=5, relevant_in_pipeline=3,
+                ),
+                overall_filtered=ChunkAccuracy(
+                    precision=1.0, recall=1.0, f1=1.0,
+                    pipeline_chunks=3, relevant_in_pipeline=3,
+                ),
+            ),
+            response_evaluation=ResponseEvalResult(
+                per_sub_question=[],
+                overall_completeness=0.85,
+                overall_factual_accuracy=0.92,
+            ),
+            timing=EvaluationTiming(
+                audio_evaluation_time_ms=23,
+                key_questions_evaluation_time_ms=6000,
+                chunk_evaluation_time_ms=14000,
+                response_evaluation_time_ms=7000,
+                total_evaluation_time_ms=27000,
+            ),
+        )
+        assert result.status == "completed"
+        assert result.audio_evaluation is not None
+        assert result.key_questions_evaluation is not None
+
+    def test_failed_status(self):
+        result = EvaluationResult(
+            evaluation_id="eval-002",
+            result_id="result-002",
+            status="failed",
+            timing=EvaluationTiming(
+                audio_evaluation_time_ms=10,
+                key_questions_evaluation_time_ms=0,
+                chunk_evaluation_time_ms=0,
+                response_evaluation_time_ms=0,
+                total_evaluation_time_ms=10,
+            ),
+        )
+        assert result.status == "failed"
+
+    def test_serialization_roundtrip(self):
+        result = EvaluationResult(
+            evaluation_id="eval-003",
+            result_id="result-003",
+            status="completed",
+            timing=EvaluationTiming(
+                audio_evaluation_time_ms=10,
+                key_questions_evaluation_time_ms=100,
+                chunk_evaluation_time_ms=200,
+                response_evaluation_time_ms=300,
+                total_evaluation_time_ms=610,
+            ),
+        )
+        data = json.loads(result.model_dump_json())
+        assert data["evaluation_id"] == "eval-003"
+        assert data["status"] == "completed"
+        # Round-trip: the dumped JSON must validate back into an equivalent model
+        reloaded = EvaluationResult.model_validate(data)
+        assert reloaded.evaluation_id == "eval-003"