221 lines
7.8 KiB
Python
221 lines
7.8 KiB
Python
import asyncio
|
||
import json
|
||
import logging
|
||
import os
|
||
import time
|
||
from typing import List, Optional
|
||
|
||
from app.models.testing import (
|
||
DimensionScores,
|
||
EvaluatorConfig,
|
||
KeyQuestionsEvalEntry,
|
||
KeyQuestionsEvalResult,
|
||
)
|
||
from app.services.llm_client import LLMClient
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# Maximum number of attempts _run_single_evaluator makes per evaluator model.
MAX_RETRIES = 3
# Seconds slept after a failed attempt, indexed by the zero-based attempt
# number (no sleep follows the final attempt).
RETRY_DELAYS = [2.0, 4.0, 8.0]
|
||
|
||
# Prompt template sent to every evaluator model. It has two ``str.format``
# placeholders, ``{original_text}`` and ``{extracted_questions}``, followed by
# a four-dimension marking scheme (40 + 25 + 20 + 15 = 100 points) and the
# JSON field names the reply must contain. The template must not contain any
# other braces, because it is rendered with ``str.format``.
_MARKING_SCHEME_PROMPT = """你正在評估從文件中提取的關鍵問題的質量。

原文/轉錄文本:
{original_text}

提取的關鍵問題:
{extracted_questions}

請根據以下評分標準評估這些關鍵問題的質量:

| 維度 | 權重 | 滿分描述 | 扣分指引 |
|------|------|---------|---------|
| 1. 準確性 (Fidelity to Original) | 40分 | 完全忠於原發言的核心意思、數字、關鍵詞及邏輯,沒有扭曲、遺漏或添加原意沒有的內容。 | 意思走樣(如把「先後緩急」改成其他概念)→ 扣 10–20 分;數字錯誤或遺漏(如 1065 戶、889 戶)→ 扣 15–25 分;完全偏離原意 → 扣 30–40 分 |
| 2. 完整性 (Completeness) | 25分 | 涵蓋原發言中該部分的所有關鍵元素(問題 + 背景 + 目的),無明顯遺漏。 | 漏掉重要背景(如「當前財政緊張」)→ 扣 8–12 分;只問一半(例如只問「可否先處理主幹道」,漏掉「後處理單車徑」)→ 扣 10–18 分;完全只剩一句問句 → 扣 20 分以上 |
| 3. 清晰度 (Clarity) | 20分 | 語言精準、邏輯清楚、易讀易懂,問題焦點一目了然,適合正式會議場合使用。 | 句子過長或結構混亂 → 扣 6–10 分;出現歧義或模糊詞 → 扣 10–15 分;完全看不懂重點 → 扣 16–20 分 |
| 4. 簡潔性 (Conciseness) | 15分 | 用最少的字數表達最完整的意思,無多餘贅詞,適合口頭提問或書面記錄。 | 過於冗長(比原發言還長)→ 扣 6–10 分;過度簡化導致意思不全 → 扣 8–13 分 |

請返回JSON格式,包含以下字段:
- dimension_1_準確性: 整數 (0-40)
- dimension_2_完整性: 整數 (0-25)
- dimension_3_清晰度: 整數 (0-20)
- dimension_4_簡潔性: 整數 (0-15)
- comments: 簡要評語
"""
|
||
|
||
|
||
def _build_eval_prompt(original_text: str, extracted_questions: List[str]) -> str:
    """Render the evaluation prompt for a text and its extracted questions.

    Each question goes on its own line, numbered from 1, and the resulting
    listing is substituted into ``_MARKING_SCHEME_PROMPT`` together with the
    original text.

    Args:
        original_text: Source or transcript text the questions were taken from.
        extracted_questions: Questions to be judged, in display order.

    Returns:
        The fully formatted prompt string.
    """
    numbered_lines = []
    for position, question in enumerate(extracted_questions, start=1):
        numbered_lines.append(f" {position}. {question}")
    listing = "\n".join(numbered_lines)

    return _MARKING_SCHEME_PROMPT.format(
        extracted_questions=listing,
        original_text=original_text,
    )
|
||
|
||
|
||
def _parse_score_response(raw: str, model_name: str) -> Optional[dict]:
|
||
try:
|
||
data = json.loads(raw)
|
||
except json.JSONDecodeError:
|
||
logger.warning("Evaluator %s returned invalid JSON: %.200s", model_name, raw)
|
||
return None
|
||
|
||
required = [
|
||
"dimension_1_準確性",
|
||
"dimension_2_完整性",
|
||
"dimension_3_清晰度",
|
||
"dimension_4_簡潔性",
|
||
]
|
||
if not all(k in data for k in required):
|
||
logger.warning("Evaluator %s missing required keys: %s", model_name, set(required) - set(data.keys()))
|
||
return None
|
||
|
||
return data
|
||
|
||
|
||
async def _run_single_evaluator(
    config: EvaluatorConfig,
    prompt: str,
    model_idx: int,
) -> Optional[dict]:
    """Score ``prompt`` with one evaluator model, retrying on failure.

    A dedicated OpenAI-compatible client is built for ``config``, the prompt is
    sent up to ``MAX_RETRIES`` times, and the first reply that parses into all
    four dimension scores is returned. The client is closed before returning.

    Args:
        config: Evaluator settings. This function reads ``api_key_env``,
            ``base_url``, ``model_name`` and ``enable_thinking``.
        prompt: Fully rendered evaluation prompt.
        model_idx: Position of this evaluator; used only to name its logger.

    Returns:
        ``KeyQuestionsEvalEntry(...).model_dump()`` for the first successful
        attempt, or ``None`` when the API key environment variable is unset or
        empty, or when every attempt fails.
    """
    api_key = os.environ.get(config.api_key_env, "")
    if not api_key:
        logger.error("API key not found for env var: %s", config.api_key_env)
        return None

    import httpx
    from openai import AsyncOpenAI

    # LLMClient is created without running __init__; the attributes below are
    # set by hand so the instance targets this evaluator's endpoint and model.
    # NOTE(review): this relies on LLMClient internals (settings, _client,
    # _langchain_model) — confirm against LLMClient when that class changes.
    client = LLMClient.__new__(LLMClient)
    client.settings = type("_Settings", (), {"vllm_engine": False, "llm_enable_thinking": config.enable_thinking})()
    client.model = config.model_name
    client.enable_thinking = config.enable_thinking
    client.logger = logging.getLogger(f"{__name__}.evaluator_{model_idx}")
    client._client = AsyncOpenAI(
        base_url=config.base_url.rstrip("/"),
        api_key=api_key,
        timeout=60.0,
        http_client=httpx.AsyncClient(
            headers={"Content-Type": "application/json"},
        ),
    )
    client._langchain_model = None

    step_name = f"Eval-{config.model_name}"

    try:
        for attempt in range(MAX_RETRIES):
            try:
                start = time.perf_counter()
                raw = await client.complete(
                    prompt=prompt,
                    temperature=0.3,
                    step_name=step_name,
                )
                # Duration of this attempt's completion call only.
                elapsed_ms = int((time.perf_counter() - start) * 1000)

                parsed = _parse_score_response(raw, config.model_name)
                if parsed is not None:
                    scores = DimensionScores(
                        dimension_1_準確性=int(parsed["dimension_1_準確性"]),
                        dimension_2_完整性=int(parsed["dimension_2_完整性"]),
                        dimension_3_清晰度=int(parsed["dimension_3_清晰度"]),
                        dimension_4_簡潔性=int(parsed["dimension_4_簡潔性"]),
                    )
                    total = (
                        scores.dimension_1_準確性
                        + scores.dimension_2_完整性
                        + scores.dimension_3_清晰度
                        + scores.dimension_4_簡潔性
                    )
                    return KeyQuestionsEvalEntry(
                        model_name=config.model_name,
                        scores=scores,
                        total_score=total,
                        max_score=100,
                        comments=parsed.get("comments", ""),
                        thinking_trace="",
                        time_ms=elapsed_ms,
                    ).model_dump()
            except Exception as exc:
                # Deliberately broad: request errors, non-numeric scores and
                # model validation failures are all treated as a failed attempt.
                logger.warning(
                    "Evaluator %s attempt %d/%d failed: %s",
                    config.model_name,
                    attempt + 1,
                    MAX_RETRIES,
                    exc,
                )

            if attempt < MAX_RETRIES - 1:
                # Clamp the index so a MAX_RETRIES larger than the delay table
                # reuses the last delay instead of raising IndexError.
                if RETRY_DELAYS:
                    delay = RETRY_DELAYS[min(attempt, len(RETRY_DELAYS) - 1)]
                else:
                    delay = 0.0
                await asyncio.sleep(delay)

        return None
    finally:
        # The client (and the httpx.AsyncClient handed to it) is created per
        # call, so release its connections here. Cleanup is best-effort and
        # must not mask the evaluation outcome.
        try:
            await client._client.close()
        except Exception as exc:
            logger.debug(
                "Evaluator %s client close failed: %s", config.model_name, exc
            )
|
||
|
||
|
||
async def evaluate_key_questions(
    original_text: str,
    extracted_questions: List[str],
    evaluator_configs: List[EvaluatorConfig],
) -> KeyQuestionsEvalResult:
    """Score extracted key questions with every configured evaluator.

    All evaluators are run concurrently on the same prompt. Evaluators that
    return ``None`` are left out of the result; the remaining entries are
    averaged per dimension and in total, each rounded to one decimal place.

    Args:
        original_text: Source or transcript text the questions came from.
        extracted_questions: Questions to be judged.
        evaluator_configs: One configuration per evaluator model.

    Returns:
        A ``KeyQuestionsEvalResult`` holding the per-evaluator entries and
        their averages. When ``evaluator_configs`` is empty, or no evaluator
        produced a result, the entry list is empty and every average is zero.
    """

    def _zeroed_result() -> KeyQuestionsEvalResult:
        # Result used when there is nothing to average.
        return KeyQuestionsEvalResult(
            evaluations=[],
            average_scores=DimensionScores(
                dimension_1_準確性=0,
                dimension_2_完整性=0,
                dimension_3_清晰度=0,
                dimension_4_簡潔性=0,
            ),
            average_total=0.0,
        )

    if not evaluator_configs:
        return _zeroed_result()

    prompt = _build_eval_prompt(original_text, extracted_questions)

    pending = [
        _run_single_evaluator(cfg, prompt, position)
        for position, cfg in enumerate(evaluator_configs)
    ]
    outcomes = await asyncio.gather(*pending)

    evaluations = [
        KeyQuestionsEvalEntry.model_validate(outcome)
        for outcome in outcomes
        if outcome is not None
    ]
    if not evaluations:
        return _zeroed_result()

    count = len(evaluations)

    def _mean(values) -> float:
        # Arithmetic mean over the successful evaluations, one decimal place.
        return round(sum(values) / count, 1)

    avg_scores = DimensionScores(
        dimension_1_準確性=_mean(e.scores.dimension_1_準確性 for e in evaluations),
        dimension_2_完整性=_mean(e.scores.dimension_2_完整性 for e in evaluations),
        dimension_3_清晰度=_mean(e.scores.dimension_3_清晰度 for e in evaluations),
        dimension_4_簡潔性=_mean(e.scores.dimension_4_簡潔性 for e in evaluations),
    )
    avg_total = _mean(e.total_score for e in evaluations)

    return KeyQuestionsEvalResult(
        evaluations=evaluations,
        average_scores=avg_scores,
        average_total=avg_total,
    )
|