legco_ai_assistant/backend/app/services/key_questions_evaluator.py

221 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import json
import logging
import os
import time
from typing import List, Optional
from app.models.testing import (
DimensionScores,
EvaluatorConfig,
KeyQuestionsEvalEntry,
KeyQuestionsEvalResult,
)
from app.services.llm_client import LLMClient
logger = logging.getLogger(__name__)

# Each evaluator call is attempted at most MAX_RETRIES times. RETRY_DELAYS[i]
# is the pause, in seconds, taken after failed attempt i before the next one.
MAX_RETRIES = 3
RETRY_DELAYS = [2.0, 4.0, 8.0]

# Prompt template sent to every evaluator model. It carries two str.format
# placeholders, {original_text} and {extracted_questions}, followed by a
# four-dimension marking scheme (40 + 25 + 20 + 15 = 100 points) and the JSON
# keys the reply must contain. The deduction guidance is written as explicit
# ranges (e.g. "10-20") so that no deduction exceeds its dimension's maximum.
_MARKING_SCHEME_PROMPT = """你正在評估從文件中提取的關鍵問題的質量。
原文/轉錄文本:
{original_text}
提取的關鍵問題:
{extracted_questions}
請根據以下評分標準評估這些關鍵問題的質量:
| 維度 | 權重 | 滿分描述 | 扣分指引 |
|------|------|---------|---------|
| 1. 準確性 (Fidelity to Original) | 40分 | 完全忠於原發言的核心意思、數字、關鍵詞及邏輯,沒有扭曲、遺漏或添加原意沒有的內容。 | 意思走樣(如把「先後緩急」改成其他概念)→ 扣 10-20 分;數字錯誤或遺漏(如 1065 戶、889 戶)→ 扣 15-25 分;完全偏離原意 → 扣 30-40 分 |
| 2. 完整性 (Completeness) | 25分 | 涵蓋原發言中該部分的所有關鍵元素(問題 + 背景 + 目的),無明顯遺漏。 | 漏掉重要背景(如「當前財政緊張」)→ 扣 8-12 分;只問一半(例如只問「可否先處理主幹道」,漏掉「後處理單車徑」)→ 扣 10-18 分;完全只剩一句問句 → 扣 20 分以上 |
| 3. 清晰度 (Clarity) | 20分 | 語言精準、邏輯清楚、易讀易懂,問題焦點一目了然,適合正式會議場合使用。 | 句子過長或結構混亂 → 扣 6-10 分;出現歧義或模糊詞 → 扣 10-15 分;完全看不懂重點 → 扣 16-20 分 |
| 4. 簡潔性 (Conciseness) | 15分 | 用最少的字數表達最完整的意思,無多餘贅詞,適合口頭提問或書面記錄。 | 過於冗長(比原發言還長)→ 扣 6-10 分;過度簡化導致意思不全 → 扣 8-13 分 |
請返回JSON格式包含以下字段
- dimension_1_準確性: 整數 (0-40)
- dimension_2_完整性: 整數 (0-25)
- dimension_3_清晰度: 整數 (0-20)
- dimension_4_簡潔性: 整數 (0-15)
- comments: 簡要評語
"""
def _build_eval_prompt(original_text: str, extracted_questions: List[str]) -> str:
    """Render the evaluation prompt for one set of extracted questions.

    Args:
        original_text: Source text or transcript the questions were taken from.
        extracted_questions: Questions to be graded, in display order.

    Returns:
        The marking-scheme template with the source text filled in and the
        questions inserted as a 1-based numbered list, one per line.
    """
    numbered = [
        f" {number}. {question}"
        for number, question in enumerate(extracted_questions, start=1)
    ]
    return _MARKING_SCHEME_PROMPT.format(
        extracted_questions="\n".join(numbered),
        original_text=original_text,
    )
def _parse_score_response(raw: str, model_name: str) -> Optional[dict]:
    """Decode an evaluator reply and check that it carries all four scores.

    Args:
        raw: Reply text from the evaluator; expected to be a JSON object.
        model_name: Name of the evaluator model, used only in log messages.

    Returns:
        The decoded JSON object when it is a dict containing every required
        dimension key; ``None`` when the reply is not valid JSON, is valid
        JSON of another type (list, number, string, ...), or lacks a key.
    """
    required = (
        "dimension_1_準確性",
        "dimension_2_完整性",
        "dimension_3_清晰度",
        "dimension_4_簡潔性",
    )

    try:
        data = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a non-string reply (e.g. None) so that it is
        # reported the same way as malformed text instead of propagating.
        logger.warning("Evaluator %s returned invalid JSON: %.200s", model_name, raw)
        return None

    # json.loads happily returns lists, numbers or strings; only an object
    # can hold the score keys, so anything else is rejected here rather than
    # failing later on `.keys()` / `in`.
    if not isinstance(data, dict):
        logger.warning(
            "Evaluator %s returned JSON of type %s instead of an object",
            model_name,
            type(data).__name__,
        )
        return None

    missing = [key for key in required if key not in data]
    if missing:
        logger.warning("Evaluator %s missing required keys: %s", model_name, missing)
        return None
    return data
async def _run_single_evaluator(
    config: EvaluatorConfig,
    prompt: str,
    model_idx: int,
) -> Optional[dict]:
    """Score ``prompt`` with one evaluator model, retrying on failure.

    Args:
        config: Evaluator settings. This function reads ``api_key_env``,
            ``model_name``, ``base_url`` and ``enable_thinking`` from it.
        prompt: Fully rendered evaluation prompt.
        model_idx: Position of this evaluator in the caller's list; used only
            to name the per-evaluator logger.

    Returns:
        A ``KeyQuestionsEvalEntry`` serialised with ``model_dump()``, or
        ``None`` when the API key environment variable is unset/empty or
        every one of the ``MAX_RETRIES`` attempts failed.
    """
    api_key = os.environ.get(config.api_key_env, "")
    if not api_key:
        logger.error("API key not found for env var: %s", config.api_key_env)
        return None

    # Local imports, kept function-scoped as in the original code.
    import httpx
    from openai import AsyncOpenAI

    # The client is assembled without running LLMClient.__init__, and its
    # attributes are set by hand so it targets this evaluator's endpoint.
    # NOTE(review): presumably these are exactly the attributes
    # LLMClient.complete reads — confirm against LLMClient.
    client = LLMClient.__new__(LLMClient)
    client.settings = type(
        "_Settings",
        (),
        {"vllm_engine": False, "llm_enable_thinking": config.enable_thinking},
    )()
    client.model = config.model_name
    client.enable_thinking = config.enable_thinking
    client.logger = logging.getLogger(f"{__name__}.evaluator_{model_idx}")
    client._client = AsyncOpenAI(
        base_url=config.base_url.rstrip("/"),
        api_key=api_key,
        timeout=60.0,
        http_client=httpx.AsyncClient(
            headers={"Content-Type": "application/json"},
        ),
    )
    client._langchain_model = None

    step_name = f"Eval-{config.model_name}"
    try:
        for attempt in range(MAX_RETRIES):
            try:
                start = time.perf_counter()
                raw = await client.complete(
                    prompt=prompt,
                    temperature=0.3,
                    step_name=step_name,
                )
                elapsed_ms = int((time.perf_counter() - start) * 1000)
                parsed = _parse_score_response(raw, config.model_name)
                if parsed is not None:
                    scores = DimensionScores(
                        dimension_1_準確性=int(parsed["dimension_1_準確性"]),
                        dimension_2_完整性=int(parsed["dimension_2_完整性"]),
                        dimension_3_清晰度=int(parsed["dimension_3_清晰度"]),
                        dimension_4_簡潔性=int(parsed["dimension_4_簡潔性"]),
                    )
                    total = (
                        scores.dimension_1_準確性
                        + scores.dimension_2_完整性
                        + scores.dimension_3_清晰度
                        + scores.dimension_4_簡潔性
                    )
                    return KeyQuestionsEvalEntry(
                        model_name=config.model_name,
                        scores=scores,
                        total_score=total,
                        max_score=100,
                        # A null or non-string "comments" value must not
                        # discard otherwise valid scores.
                        comments=str(parsed.get("comments") or ""),
                        thinking_trace="",
                        time_ms=elapsed_ms,
                    ).model_dump()
            except Exception as exc:
                # Best-effort evaluator: any failure of this attempt (request
                # error, non-numeric score, validation error) is logged and
                # the next attempt is tried.
                logger.warning(
                    "Evaluator %s attempt %d/%d failed: %s",
                    config.model_name,
                    attempt + 1,
                    MAX_RETRIES,
                    exc,
                )
            if attempt < MAX_RETRIES - 1:
                # Reuse the last configured delay if MAX_RETRIES is ever
                # raised beyond the length of RETRY_DELAYS.
                if RETRY_DELAYS:
                    delay = RETRY_DELAYS[min(attempt, len(RETRY_DELAYS) - 1)]
                else:
                    delay = 0.0
                await asyncio.sleep(delay)
        return None
    finally:
        # Release the HTTP connection pool created above; without this each
        # call leaves an unclosed httpx.AsyncClient behind.
        try:
            await client._client.close()
        except Exception:
            logger.debug(
                "Failed to close client for evaluator %s",
                config.model_name,
                exc_info=True,
            )
async def evaluate_key_questions(
    original_text: str,
    extracted_questions: List[str],
    evaluator_configs: List[EvaluatorConfig],
) -> KeyQuestionsEvalResult:
    """Grade extracted key questions with several evaluators and average them.

    All evaluators are run concurrently on the same prompt. Evaluators that
    return ``None`` or raise are skipped; the averages cover only the
    evaluators that produced a score.

    Args:
        original_text: Source text or transcript the questions came from.
        extracted_questions: Questions to grade.
        evaluator_configs: One configuration per evaluator model.

    Returns:
        The per-evaluator entries plus per-dimension and total averages, each
        rounded to one decimal place. When ``evaluator_configs`` is empty or
        no evaluator succeeds, an empty result with all-zero averages.
    """
    if not evaluator_configs:
        return _empty_eval_result()

    prompt = _build_eval_prompt(original_text, extracted_questions)
    # return_exceptions=True keeps one evaluator that raises (for example
    # while its client is being constructed, outside its own retry loop)
    # from discarding the results of all the others.
    outcomes = await asyncio.gather(
        *(
            _run_single_evaluator(cfg, prompt, idx)
            for idx, cfg in enumerate(evaluator_configs)
        ),
        return_exceptions=True,
    )

    evaluations = []
    for idx, outcome in enumerate(outcomes):
        if isinstance(outcome, Exception):
            logger.error("Evaluator #%d failed: %r", idx, outcome)
            continue
        if isinstance(outcome, BaseException):
            # Cancellation / interpreter-exit signals must not be swallowed.
            raise outcome
        if outcome is not None:
            evaluations.append(KeyQuestionsEvalEntry.model_validate(outcome))

    if not evaluations:
        return _empty_eval_result()

    avg_scores = DimensionScores(
        dimension_1_準確性=_rounded_mean(
            [e.scores.dimension_1_準確性 for e in evaluations]
        ),
        dimension_2_完整性=_rounded_mean(
            [e.scores.dimension_2_完整性 for e in evaluations]
        ),
        dimension_3_清晰度=_rounded_mean(
            [e.scores.dimension_3_清晰度 for e in evaluations]
        ),
        dimension_4_簡潔性=_rounded_mean(
            [e.scores.dimension_4_簡潔性 for e in evaluations]
        ),
    )
    return KeyQuestionsEvalResult(
        evaluations=evaluations,
        average_scores=avg_scores,
        average_total=_rounded_mean([e.total_score for e in evaluations]),
    )


def _empty_eval_result() -> KeyQuestionsEvalResult:
    """Build the result returned when no evaluation is available."""
    return KeyQuestionsEvalResult(
        evaluations=[],
        average_scores=DimensionScores(
            dimension_1_準確性=0,
            dimension_2_完整性=0,
            dimension_3_清晰度=0,
            dimension_4_簡潔性=0,
        ),
        average_total=0.0,
    )


def _rounded_mean(values: List[float]) -> float:
    """Return the arithmetic mean of a non-empty list, rounded to 1 decimal."""
    return round(sum(values) / len(values), 1)