"""Score extracted key questions with one or more LLM evaluators.

Each configured evaluator receives the same marking-scheme prompt, returns a
JSON object with four dimension scores, and the per-evaluator results are
averaged into a single ``KeyQuestionsEvalResult``.
"""

import asyncio
import json
import logging
import os
import time
from typing import List, Optional

from app.models.testing import (
    DimensionScores,
    EvaluatorConfig,
    KeyQuestionsEvalEntry,
    KeyQuestionsEvalResult,
)
from app.services.llm_client import LLMClient

logger = logging.getLogger(__name__)

# Maximum number of attempts per evaluator, and the back-off (in seconds)
# applied after a failed attempt that raised an exception.
MAX_RETRIES = 3
RETRY_DELAYS = [2.0, 4.0, 8.0]

# Keys every evaluator response must contain to be accepted.
_REQUIRED_SCORE_KEYS = (
    "dimension_1_準確性",
    "dimension_2_完整性",
    "dimension_3_清晰度",
    "dimension_4_簡潔性",
)

# Prompt template; `{original_text}` and `{extracted_questions}` are the only
# `str.format` placeholders, so the template must not contain other braces.
_MARKING_SCHEME_PROMPT = """你正在評估從文件中提取的關鍵問題的質量。

原文/轉錄文本:
{original_text}

提取的關鍵問題:
{extracted_questions}

請根據以下評分標準評估這些關鍵問題的質量:

| 維度 | 權重 | 滿分描述 | 扣分指引 |
|------|------|---------|---------|
| 1. 準確性 (Fidelity to Original) | 40分 | 完全忠於原發言的核心意思、數字、關鍵詞及邏輯,沒有扭曲、遺漏或添加原意沒有的內容。 | 意思走樣(如把「先後緩急」改成其他概念)→ 扣 10–20 分;數字錯誤或遺漏(如 1065 戶、889 戶)→ 扣 15–25 分;完全偏離原意 → 扣 30–40 分 |
| 2. 完整性 (Completeness) | 25分 | 涵蓋原發言中該部分的所有關鍵元素(問題 + 背景 + 目的),無明顯遺漏。 | 漏掉重要背景(如「當前財政緊張」)→ 扣 8–12 分;只問一半(例如只問「可否先處理主幹道」,漏掉「後處理單車徑」)→ 扣 10–18 分;完全只剩一句問句 → 扣 20 分以上 |
| 3. 清晰度 (Clarity) | 20分 | 語言精準、邏輯清楚、易讀易懂,問題焦點一目了然,適合正式會議場合使用。 | 句子過長或結構混亂 → 扣 6–10 分;出現歧義或模糊詞 → 扣 10–15 分;完全看不懂重點 → 扣 16–20 分 |
| 4. 簡潔性 (Conciseness) | 15分 | 用最少的字數表達最完整的意思,無多餘贅詞,適合口頭提問或書面記錄。 | 過於冗長(比原發言還長)→ 扣 6–10 分;過度簡化導致意思不全 → 扣 8–13 分 |

請返回JSON格式,包含以下字段:
- dimension_1_準確性: 整數 (0-40)
- dimension_2_完整性: 整數 (0-25)
- dimension_3_清晰度: 整數 (0-20)
- dimension_4_簡潔性: 整數 (0-15)
- comments: 簡要評語
"""


def _build_eval_prompt(original_text: str, extracted_questions: List[str]) -> str:
    """Fill the marking-scheme template with the text and a numbered question list.

    Args:
        original_text: The source text / transcript the questions came from.
        extracted_questions: The questions to be scored.

    Returns:
        The complete prompt string sent to every evaluator.
    """
    questions_str = "\n".join(
        f" {i + 1}. {q}" for i, q in enumerate(extracted_questions)
    )
    return _MARKING_SCHEME_PROMPT.format(
        original_text=original_text,
        extracted_questions=questions_str,
    )


def _load_json_object(raw: object) -> Optional[dict]:
    """Return the JSON object contained in *raw*, or ``None`` if there is none.

    The whole payload is tried first. If that does not yield an object and
    *raw* is a string, the span from the first ``{`` to the last ``}`` is
    tried, which recovers an object wrapped in a markdown code fence or
    surrounded by explanatory prose.
    """
    try:
        data = json.loads(raw)
    except (TypeError, ValueError):
        # TypeError: raw is not str/bytes (e.g. None).
        # ValueError: covers json.JSONDecodeError.
        data = None

    if not isinstance(data, dict) and isinstance(raw, str):
        start, end = raw.find("{"), raw.rfind("}")
        if 0 <= start < end:
            try:
                data = json.loads(raw[start:end + 1])
            except ValueError:
                data = None

    return data if isinstance(data, dict) else None


def _parse_score_response(raw: str, model_name: str) -> Optional[dict]:
    """Parse an evaluator reply into a dict holding all required score keys.

    Args:
        raw: The evaluator's reply text.
        model_name: Evaluator name, used only in log messages.

    Returns:
        The parsed JSON object, or ``None`` (after logging a warning) when the
        reply contains no JSON object or lacks any of the four dimension keys.
    """
    data = _load_json_object(raw)
    if data is None:
        logger.warning("Evaluator %s returned invalid JSON: %.200s", model_name, raw)
        return None

    missing = set(_REQUIRED_SCORE_KEYS) - data.keys()
    if missing:
        logger.warning("Evaluator %s missing required keys: %s", model_name, missing)
        return None
    return data


async def _run_single_evaluator(
    config: EvaluatorConfig,
    prompt: str,
    model_idx: int,
) -> Optional[dict]:
    """Run one evaluator against *prompt*, retrying up to ``MAX_RETRIES`` times.

    Args:
        config: Evaluator settings; this function reads ``api_key_env``,
            ``model_name``, ``base_url`` and ``enable_thinking``.
        prompt: The fully rendered evaluation prompt.
        model_idx: Position of the evaluator, used to name its logger.

    Returns:
        ``KeyQuestionsEvalEntry(...).model_dump()`` for the first attempt that
        yields a parseable response, or ``None`` when the API key is missing
        or every attempt fails.
    """
    api_key = os.environ.get(config.api_key_env, "")
    if not api_key:
        logger.error("API key not found for env var: %s", config.api_key_env)
        return None

    # Imported at call time, as in the original code.
    import httpx
    from openai import AsyncOpenAI

    http_client = httpx.AsyncClient(
        headers={"Content-Type": "application/json"},
    )
    try:
        # `__new__` skips `LLMClient.__init__`; the attributes it would have set
        # are assigned by hand below.
        # NOTE(review): presumably so each evaluator can target its own
        # endpoint/model rather than the app-wide settings — confirm.
        client = LLMClient.__new__(LLMClient)
        client.settings = type(
            "_Settings",
            (),
            {"vllm_engine": False, "llm_enable_thinking": config.enable_thinking},
        )()
        client.model = config.model_name
        client.enable_thinking = config.enable_thinking
        client.logger = logging.getLogger(f"{__name__}.evaluator_{model_idx}")
        client._client = AsyncOpenAI(
            base_url=config.base_url.rstrip("/"),
            api_key=api_key,
            timeout=60.0,
            http_client=http_client,
        )
        client._langchain_model = None

        step_name = f"Eval-{config.model_name}"
        for attempt in range(MAX_RETRIES):
            try:
                start = time.perf_counter()
                raw = await client.complete(
                    prompt=prompt,
                    temperature=0.3,
                    step_name=step_name,
                )
                elapsed_ms = int((time.perf_counter() - start) * 1000)

                parsed = _parse_score_response(raw, config.model_name)
                if parsed is None:
                    # Unparseable reply: retry immediately (no back-off), as
                    # the original code did.
                    continue

                scores = DimensionScores(
                    dimension_1_準確性=int(parsed["dimension_1_準確性"]),
                    dimension_2_完整性=int(parsed["dimension_2_完整性"]),
                    dimension_3_清晰度=int(parsed["dimension_3_清晰度"]),
                    dimension_4_簡潔性=int(parsed["dimension_4_簡潔性"]),
                )
                total = (
                    scores.dimension_1_準確性
                    + scores.dimension_2_完整性
                    + scores.dimension_3_清晰度
                    + scores.dimension_4_簡潔性
                )
                # A JSON `null` (or a non-string value) for "comments" must not
                # discard an otherwise valid set of scores.
                comments = parsed.get("comments")
                comments = "" if comments is None else str(comments)

                return KeyQuestionsEvalEntry(
                    model_name=config.model_name,
                    scores=scores,
                    total_score=total,
                    max_score=100,
                    comments=comments,
                    thinking_trace="",
                    time_ms=elapsed_ms,
                ).model_dump()
            except Exception as exc:
                logger.warning(
                    "Evaluator %s attempt %d/%d failed: %s",
                    config.model_name,
                    attempt + 1,
                    MAX_RETRIES,
                    exc,
                )
                if attempt < MAX_RETRIES - 1:
                    # Clamp the index so raising MAX_RETRIES beyond the delay
                    # table reuses the last delay instead of raising IndexError.
                    delay = RETRY_DELAYS[min(attempt, len(RETRY_DELAYS) - 1)]
                    await asyncio.sleep(delay)

        logger.error(
            "Evaluator %s produced no valid result after %d attempts",
            config.model_name,
            MAX_RETRIES,
        )
        return None
    finally:
        # The HTTP client is created per call, so it must be closed per call;
        # otherwise its connection pool is leaked.
        await http_client.aclose()


def _empty_result() -> KeyQuestionsEvalResult:
    """Return the all-zero result used when no evaluation is available."""
    return KeyQuestionsEvalResult(
        evaluations=[],
        average_scores=DimensionScores(
            dimension_1_準確性=0,
            dimension_2_完整性=0,
            dimension_3_清晰度=0,
            dimension_4_簡潔性=0,
        ),
        average_total=0.0,
    )


def _mean_score(evaluations: list, field: str) -> float:
    """Return the mean of ``scores.<field>`` over *evaluations*, rounded to 1 dp."""
    return round(
        sum(getattr(e.scores, field) for e in evaluations) / len(evaluations), 1
    )


async def evaluate_key_questions(
    original_text: str,
    extracted_questions: List[str],
    evaluator_configs: List[EvaluatorConfig],
) -> KeyQuestionsEvalResult:
    """Score the extracted questions with every evaluator and average the results.

    All evaluators run concurrently on the same prompt. Evaluators that return
    ``None`` or raise are skipped, so one failing evaluator does not discard
    the results of the others.

    Args:
        original_text: The source text / transcript.
        extracted_questions: The key questions to evaluate.
        evaluator_configs: One entry per evaluator to run.

    Returns:
        A ``KeyQuestionsEvalResult`` with the individual evaluations, the
        per-dimension averages and the average total (each rounded to one
        decimal). An all-zero result with no evaluations is returned when no
        evaluator is configured or none succeeds.
    """
    if not evaluator_configs:
        return _empty_result()

    prompt = _build_eval_prompt(original_text, extracted_questions)
    # return_exceptions=True: an unexpected error in one evaluator (e.g. a bad
    # config raising before its retry loop) is reported in its slot rather
    # than aborting the whole gather.
    outcomes = await asyncio.gather(
        *(
            _run_single_evaluator(cfg, prompt, i)
            for i, cfg in enumerate(evaluator_configs)
        ),
        return_exceptions=True,
    )

    evaluations = []
    for cfg, outcome in zip(evaluator_configs, outcomes):
        if isinstance(outcome, BaseException):
            logger.error("Evaluator %s crashed: %r", cfg.model_name, outcome)
            continue
        if outcome is not None:
            evaluations.append(KeyQuestionsEvalEntry.model_validate(outcome))

    if not evaluations:
        return _empty_result()

    # NOTE(review): the averages are floats rounded to one decimal, so
    # DimensionScores must accept non-integer values here — confirm.
    avg_scores = DimensionScores(
        dimension_1_準確性=_mean_score(evaluations, "dimension_1_準確性"),
        dimension_2_完整性=_mean_score(evaluations, "dimension_2_完整性"),
        dimension_3_清晰度=_mean_score(evaluations, "dimension_3_清晰度"),
        dimension_4_簡潔性=_mean_score(evaluations, "dimension_4_簡潔性"),
    )
    avg_total = round(sum(e.total_score for e in evaluations) / len(evaluations), 1)

    return KeyQuestionsEvalResult(
        evaluations=evaluations,
        average_scores=avg_scores,
        average_total=avg_total,
    )