feat: add Sub-Phase 9.2 evaluation engine (CER/WER, key questions, chunk, response)

2026-05-25 18:45:53 +08:00 · 2026-05-25 18:45:53 +08:00 · 098be359e7
parent ac81df0704
commit 098be359e7
8 changed files with 1010 additions and 4 deletions
--- a/backend/app/models/testing.py
+++ b/backend/app/models/testing.py
@ -121,10 +121,10 @@ class GenerateResult(BaseModel):
 class DimensionScores(BaseModel):
-    dimension_1_準確性: int = Field(ge=0, le=40)
+    dimension_1_準確性: float = Field(ge=0, le=40)
-    dimension_2_完整性: int = Field(ge=0, le=25)
+    dimension_2_完整性: float = Field(ge=0, le=25)
-    dimension_3_清晰度: int = Field(ge=0, le=20)
+    dimension_3_清晰度: float = Field(ge=0, le=20)
-    dimension_4_簡潔性: int = Field(ge=0, le=15)
+    dimension_4_簡潔性: float = Field(ge=0, le=15)
 class KeyQuestionsEvalEntry(BaseModel):
--- a/backend/app/services/cer_wer.py
+++ b/backend/app/services/cer_wer.py
@ -0,0 +1,156 @@
 def _levenshtein_distance(s1: str, s2: str) -> tuple:
    """Count the edit operations in a minimum-cost alignment of two sequences.

    ``s1`` is the reference and ``s2`` the hypothesis. Only ``len()``,
    indexing and ``==`` on elements are used, so a list of tokens is handled
    the same way as a string.

    Returns:
        ``(substitutions, deletions, insertions, hits)``. A deletion is a
        reference element with no counterpart in the hypothesis; an insertion
        is a hypothesis element with no counterpart in the reference.
    """
    m, n = len(s1), len(s2)
    # Degenerate cases follow the same convention as the backtrace below:
    # an empty reference means every hypothesis element was inserted, an
    # empty hypothesis means every reference element was deleted.
    if m == 0:
        return 0, 0, n, 0
    if n == 0:
        return 0, m, 0, 0

    # dp[i][j] is the edit distance between s1[:i] and s2[:j].
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s1[i - 1] == s2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(
                    dp[i - 1][j],  # drop a reference element
                    dp[i][j - 1],  # take an extra hypothesis element
                    dp[i - 1][j - 1],  # replace one element by another
                )

    # Walk back from the bottom-right corner, classifying each step.
    # Preference order on ties: hit, substitution, deletion, insertion.
    i, j = m, n
    substitutions = 0
    deletions = 0
    insertions = 0
    hits = 0
    while i > 0 or j > 0:
        if i > 0 and j > 0 and s1[i - 1] == s2[j - 1]:
            hits += 1
            i -= 1
            j -= 1
        elif i > 0 and j > 0 and dp[i][j] == dp[i - 1][j - 1] + 1:
            substitutions += 1
            i -= 1
            j -= 1
        elif i > 0 and dp[i][j] == dp[i - 1][j] + 1:
            deletions += 1
            i -= 1
        elif j > 0:
            insertions += 1
            j -= 1
    return substitutions, deletions, insertions, hits
 def _tokenize_words(text: str) -> list:
    """Break ``text`` into tokens at runs of whitespace.

    No language-specific segmentation is performed: a stretch of text that
    contains no whitespace (for example unsegmented Chinese) comes back as a
    single token. Character-level CER works on the raw string instead.
    """
    tokens = text.split()
    return tokens
 def calculate_cer(reference: str, hypothesis: str) -> dict:
    """Compute the character error rate of ``hypothesis`` against ``reference``.

    The rate is ``(substitutions + deletions + insertions) / len(reference)``,
    rounded to six decimals. An empty reference yields a rate of 0.0 with all
    counts zero; an empty hypothesis against a non-empty reference yields 1.0
    with every reference character counted as a deletion.

    Returns:
        dict with keys ``cer``, ``reference_length``, ``transcribed_length``,
        ``substitutions``, ``deletions``, ``insertions`` and ``hits``.
    """
    n_ref = len(reference)
    n_hyp = len(hypothesis)

    def _report(rate, subs, dels, inss, hits):
        # Single place that fixes the shape of the result dict.
        return {
            "cer": rate,
            "reference_length": n_ref,
            "transcribed_length": n_hyp,
            "substitutions": subs,
            "deletions": dels,
            "insertions": inss,
            "hits": hits,
        }

    if n_ref == 0:
        return _report(0.0, 0, 0, 0, 0)
    if n_hyp == 0:
        return _report(1.0, 0, n_ref, 0, 0)

    subs, dels, inss, hits = _levenshtein_distance(reference, hypothesis)
    errors = subs + dels + inss
    return _report(round(errors / max(1, n_ref), 6), subs, dels, inss, hits)
 def calculate_wer(reference: str, hypothesis: str) -> dict:
    """Compute the word error rate of ``hypothesis`` against ``reference``.

    Both texts are tokenized with ``_tokenize_words`` and the token lists are
    aligned. The rate is ``(substitutions + deletions + insertions)`` divided
    by the number of reference tokens, rounded to six decimals. A reference
    with no tokens yields 0.0 with all counts zero; a hypothesis with no
    tokens yields 1.0 with every reference token counted as a deletion.

    Returns:
        dict with keys ``wer``, ``reference_length``, ``transcribed_length``,
        ``substitutions``, ``deletions``, ``insertions`` and ``hits``; the two
        lengths are token counts.
    """
    ref_tokens = _tokenize_words(reference)
    hyp_tokens = _tokenize_words(hypothesis)
    n_ref, n_hyp = len(ref_tokens), len(hyp_tokens)

    def _report(rate, subs, dels, inss, hits):
        # Single place that fixes the shape of the result dict.
        return {
            "wer": rate,
            "reference_length": n_ref,
            "transcribed_length": n_hyp,
            "substitutions": subs,
            "deletions": dels,
            "insertions": inss,
            "hits": hits,
        }

    if n_ref == 0:
        return _report(0.0, 0, 0, 0, 0)
    if n_hyp == 0:
        return _report(1.0, 0, n_ref, 0, 0)

    subs, dels, inss, hits = _levenshtein_distance(ref_tokens, hyp_tokens)
    errors = subs + dels + inss
    return _report(round(errors / max(1, n_ref), 6), subs, dels, inss, hits)
--- a/backend/app/services/chunk_evaluator.py
+++ b/backend/app/services/chunk_evaluator.py
@ -0,0 +1,169 @@
 import asyncio
 import json
 import logging
 import os
 import time
 from typing import Any, Dict, List, Optional, Set, Tuple
 from app.models.testing import (
    ChunkAccuracy,
    EvaluatorConfig,
    GroundTruthInfo,
    SubQuestionChunkEval,
 )
 from app.services.llm_client import LLMClient
 logger = logging.getLogger(__name__)
 # Default number of chunks placed into one relevance prompt.
 CHUNK_BATCH_SIZE = 10
 # Default number of extra attempts after the first call (retries + 1 attempts in total).
 CHUNK_MAX_RETRIES = 2
 # Seconds slept between two attempts on the same batch.
 CHUNK_RETRY_DELAY = 2.0
 # Instruction header placed at the top of every chunk-evaluation prompt. It asks
 # the model to judge each <chunk_N> against <sub_question> and to answer with
 # JSON of the form {"relevant_chunk_indices": [...]}, using 0-based indices
 # counted from the first chunk of the current batch.
 _CHUNK_EVAL_SYSTEM = """你正在評估文檔塊與關鍵問題的相關性。
 對於每個<chunk_N>，判斷其是否包含與<sub_question>相關的信息。
 返回JSON：{"relevant_chunk_indices": [0, 3, 7]}（僅包含相關的塊索引，0-based，從本批次的第一個塊算起）"""
 def _split_into_batches(
    chunks: List[Tuple[str, int, str, Dict[str, Any]]], batch_size: int = CHUNK_BATCH_SIZE
 ) -> List[List[Tuple[str, int, str, Dict[str, Any]]]]:
    """Cut the flat chunk list into consecutive slices of at most ``batch_size``.

    Order is preserved; only the final slice may be shorter than
    ``batch_size``. An empty input gives an empty list of batches.
    """
    starts = range(0, len(chunks), batch_size)
    return [chunks[lo : lo + batch_size] for lo in starts]
 def _parse_relevance_response(raw: str) -> Optional[List[int]]:
    """Extract the list of relevant chunk indices from an LLM reply.

    The reply must be a JSON object whose ``relevant_chunk_indices`` value is
    a list of values convertible with ``int()``.

    Returns:
        The indices as a list of ints (possibly empty), or ``None`` when the
        reply is not valid JSON, is not an object, lacks the key, holds a
        non-list value, or contains an entry that cannot be converted.
    """
    try:
        data = json.loads(raw)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers a non-str
        # reply such as None.
        return None
    if not isinstance(data, dict):
        return None
    indices = data.get("relevant_chunk_indices")
    if not isinstance(indices, list):
        return None
    try:
        return [int(i) for i in indices]
    except (TypeError, ValueError, OverflowError):
        # A malformed entry ("x", null, Infinity, nested list) makes the
        # whole reply unusable; report it like any other parse failure
        # instead of raising into the caller.
        return None
 def _build_chunk_batch_prompt(
    sub_question: str, batch: List[Tuple[str, int, str, Dict[str, Any]]]
 ) -> str:
    """Render the relevance prompt for one batch of chunks.

    The prompt starts with ``_CHUNK_EVAL_SYSTEM``, followed by the question
    wrapped in ``<sub_question>`` tags and one ``<chunk_N>`` element per
    chunk. ``N`` is the position inside ``batch`` (0-based), not the chunk's
    global index. Each chunk tag carries the document id and the
    ``page_number`` from the chunk metadata (``?`` when absent).
    """
    lines = [
        _CHUNK_EVAL_SYSTEM,
        "",
        "<sub_question>",
        sub_question,
        "</sub_question>",
        "",
    ]
    # The second tuple field (global chunk index) is not part of the prompt.
    for position, (doc_id, _global_idx, text, meta) in enumerate(batch):
        page = meta.get("page_number", "?")
        lines.extend(
            (
                f'<chunk_{position} doc="{doc_id}" page="{page}">',
                text,
                f"</chunk_{position}>",
                "",
            )
        )
    return "\n".join(lines)
 def _make_eval_client(config: EvaluatorConfig, model_idx: int) -> LLMClient:
    """Build an ``LLMClient`` that talks to the evaluator endpoint in ``config``.

    The instance is created with ``LLMClient.__new__`` so ``LLMClient.__init__``
    does not run; the attributes assigned below are set by hand instead.
    NOTE(review): presumably these are exactly the attributes
    ``LLMClient.complete`` reads; confirm against ``LLMClient``.

    Args:
        config: Evaluator settings; ``api_key_env`` names the environment
            variable holding the API key.
        model_idx: Used only to name the per-client logger.

    Returns:
        The configured client. A missing API key does not prevent
        construction; it is logged and an empty key is used.
    """
    import httpx
    from openai import AsyncOpenAI

    api_key = os.environ.get(config.api_key_env, "")
    if not api_key:
        # Still hand back a client, but make the misconfiguration visible
        # here rather than only as a failed request later.
        logger.warning("API key not found for env var: %s", config.api_key_env)

    client = LLMClient.__new__(LLMClient)
    client.settings = type("_Settings", (), {"vllm_engine": False, "llm_enable_thinking": config.enable_thinking})()
    client.model = config.model_name
    client.enable_thinking = config.enable_thinking
    client.logger = logging.getLogger(f"{__name__}.eval_{model_idx}")
    client._client = AsyncOpenAI(
        base_url=config.base_url.rstrip("/"),
        api_key=api_key,
        timeout=120.0,
        http_client=httpx.AsyncClient(headers={"Content-Type": "application/json"}),
    )
    client._langchain_model = None
    return client
 async def _evaluate_batch(
    client: LLMClient, prompt: str, retries: int = CHUNK_MAX_RETRIES
 ) -> Optional[List[int]]:
    """Ask the model which chunks of one batch are relevant, with retries.

    Up to ``retries + 1`` attempts are made, sleeping ``CHUNK_RETRY_DELAY``
    seconds between attempts. An attempt fails when the call raises or when
    the reply cannot be parsed by ``_parse_relevance_response``.

    Returns:
        The batch-local indices reported by the model, or ``None`` when every
        attempt failed.
    """
    total_attempts = retries + 1
    for attempt in range(total_attempts):
        try:
            raw = await client.complete(prompt=prompt, temperature=0.1, step_name="ChunkEval")
            result = _parse_relevance_response(raw)
            if result is not None:
                return result
            # Without this a malformed reply would be retried, and finally
            # dropped, without leaving any trace in the logs.
            logger.warning(
                "Chunk batch eval attempt %d returned an unparseable response: %.200s",
                attempt + 1,
                raw,
            )
        except Exception as exc:
            logger.warning("Chunk batch eval attempt %d failed: %s", attempt + 1, exc)
        if attempt < retries:
            await asyncio.sleep(CHUNK_RETRY_DELAY)
    logger.warning("Chunk batch eval gave up after %d attempts", total_attempts)
    return None
 async def _determine_ground_truth_chunks(
    sub_question: str,
    all_chunks: List[Tuple[str, int, str, Dict[str, Any]]],
    config: EvaluatorConfig,
    semaphore: asyncio.Semaphore,
    model_idx: int = 0,
    batch_size: int = CHUNK_BATCH_SIZE,
 ) -> Tuple[Set[Tuple[str, int]], int, int]:
    """Determine which chunks are relevant to a key question.

    The chunks are split into batches, each batch is judged by the evaluator
    model (concurrency bounded by ``semaphore``), and the batch-local indices
    returned by the model are mapped back to ``(doc_id, chunk_index)`` pairs
    taken from the first two fields of each chunk tuple.

    Batches whose evaluation failed contribute nothing to the result; the
    number of such batches is logged. Indices outside the batch are ignored.

    Returns:
        ``(ground_truth_set, total_chunks, elapsed_ms)``.
    """
    start = time.perf_counter()
    batches = _split_into_batches(all_chunks, batch_size)
    client = _make_eval_client(config, model_idx)

    async def _eval_with_limit(batch):
        async with semaphore:
            prompt = _build_chunk_batch_prompt(sub_question, batch)
            return await _evaluate_batch(client, prompt)

    try:
        batch_results = await asyncio.gather(*[_eval_with_limit(b) for b in batches])
    finally:
        # The client is created for this call only; close its HTTP client so
        # pooled connections are not left open. Looked up defensively because
        # the client object is built by a helper.
        transport = getattr(client, "_client", None)
        closer = getattr(transport, "close", None)
        if closer is not None:
            try:
                await closer()
            except Exception as exc:
                logger.debug("Closing chunk evaluator client failed: %s", exc)

    ground_truth: Set[Tuple[str, int]] = set()
    failed_batches = 0
    for batch, result in zip(batches, batch_results):
        if result is None:
            failed_batches += 1
            continue
        for batch_local_idx in result:
            if 0 <= batch_local_idx < len(batch):
                doc_id = batch[batch_local_idx][0]
                chunk_global_idx = batch[batch_local_idx][1]
                ground_truth.add((doc_id, chunk_global_idx))
    if failed_batches:
        # A failed batch is indistinguishable from "no relevant chunks" in the
        # returned set, so surface it for whoever reads the metrics.
        logger.warning(
            "%d of %d chunk batches could not be evaluated; ground truth may be incomplete",
            failed_batches,
            len(batches),
        )
    elapsed_ms = int((time.perf_counter() - start) * 1000)
    return ground_truth, len(all_chunks), elapsed_ms
 def _calculate_accuracy(
    pipeline_chunks: Set[Tuple[str, int]], ground_truth: Set[Tuple[str, int]]
 ) -> ChunkAccuracy:
    """Score the pipeline's chunk selection against the ground-truth set.

    Precision is the share of pipeline chunks that are in the ground truth,
    recall the share of ground-truth chunks the pipeline found, and F1 their
    harmonic mean; all three are rounded to four decimals. An empty pipeline
    set gives all-zero metrics; an empty ground truth gives zero recall.
    """
    retrieved = len(pipeline_chunks)
    if retrieved == 0:
        return ChunkAccuracy(precision=0.0, recall=0.0, f1=0.0, pipeline_chunks=0, relevant_in_pipeline=0)

    overlap = len(pipeline_chunks & ground_truth)
    precision = overlap / retrieved
    if ground_truth:
        recall = overlap / len(ground_truth)
    else:
        recall = 0.0
    denominator = precision + recall
    if denominator > 0:
        f1 = 2 * precision * recall / denominator
    else:
        f1 = 0.0
    return ChunkAccuracy(
        precision=round(precision, 4),
        recall=round(recall, 4),
        f1=round(f1, 4),
        pipeline_chunks=retrieved,
        relevant_in_pipeline=overlap,
    )
--- a/backend/app/services/key_questions_evaluator.py
+++ b/backend/app/services/key_questions_evaluator.py
@ -0,0 +1,220 @@
 import asyncio
 import json
 import logging
 import os
 import time
 from typing import List, Optional
 from app.models.testing import (
    DimensionScores,
    EvaluatorConfig,
    KeyQuestionsEvalEntry,
    KeyQuestionsEvalResult,
 )
 from app.services.llm_client import LLMClient
 logger = logging.getLogger(__name__)
 # Total number of attempts made per evaluator model.
 MAX_RETRIES = 3
 # Seconds to sleep after a failed attempt, indexed by attempt number. No sleep
 # follows the final attempt, so the last entry is unused with MAX_RETRIES = 3.
 RETRY_DELAYS = [2.0, 4.0, 8.0]
 # Prompt template for scoring extracted key questions. It takes two format
 # fields, {original_text} and {extracted_questions}, lays out a four-dimension
 # rubric (40 + 25 + 20 + 15 points) with deduction guidance, and asks for a JSON
 # reply containing the four dimension_* scores plus a "comments" field.
 _MARKING_SCHEME_PROMPT = """你正在評估從文件中提取的關鍵問題的質量。
 原文/轉錄文本：
 {original_text}
 提取的關鍵問題：
 {extracted_questions}
 請根據以下評分標準評估這些關鍵問題的質量：
 | 維度 | 權重 | 滿分描述 | 扣分指引 |
 |------|------|---------|---------|
 | 1. 準確性 (Fidelity to Original) | 40分 | 完全忠於原發言的核心意思、數字、關鍵詞及邏輯，沒有扭曲、遺漏或添加原意沒有的內容。 | 意思走樣（如把「先後緩急」改成其他概念）→ 扣 10–20 分；數字錯誤或遺漏（如 1065 戶、889 戶）→ 扣 15–25 分；完全偏離原意 → 扣 30–40 分 |
 | 2. 完整性 (Completeness) | 25分 | 涵蓋原發言中該部分的所有關鍵元素（問題 + 背景 + 目的），無明顯遺漏。 | 漏掉重要背景（如「當前財政緊張」）→ 扣 8–12 分；只問一半（例如只問「可否先處理主幹道」，漏掉「後處理單車徑」）→ 扣 10–18 分；完全只剩一句問句 → 扣 20 分以上 |
 | 3. 清晰度 (Clarity) | 20分 | 語言精準、邏輯清楚、易讀易懂，問題焦點一目了然，適合正式會議場合使用。 | 句子過長或結構混亂 → 扣 6–10 分；出現歧義或模糊詞 → 扣 10–15 分；完全看不懂重點 → 扣 16–20 分 |
 | 4. 簡潔性 (Conciseness) | 15分 | 用最少的字數表達最完整的意思，無多餘贅詞，適合口頭提問或書面記錄。 | 過於冗長（比原發言還長）→ 扣 6–10 分；過度簡化導致意思不全 → 扣 8–13 分 |
 請返回JSON格式，包含以下字段：
 - dimension_1_準確性: 整數 (0-40)
 - dimension_2_完整性: 整數 (0-25)
 - dimension_3_清晰度: 整數 (0-20)
 - dimension_4_簡潔性: 整數 (0-15)
 - comments: 簡要評語
 """
 def _build_eval_prompt(original_text: str, extracted_questions: List[str]) -> str:
    """Fill the marking-scheme template with the source text and questions.

    The questions are rendered one per line as a 1-based numbered list, each
    line indented by two spaces.
    """
    numbered = [
        f"  {position}. {question}"
        for position, question in enumerate(extracted_questions, start=1)
    ]
    return _MARKING_SCHEME_PROMPT.format(
        original_text=original_text,
        extracted_questions="\n".join(numbered),
    )
 def _parse_score_response(raw: str, model_name: str) -> Optional[dict]:
    """Parse an evaluator reply into a score dict.

    Args:
        raw: Reply text expected to be a JSON object.
        model_name: Evaluator name, used in log messages only.

    Returns:
        The decoded object when it is a JSON object containing all four
        ``dimension_*`` keys; otherwise ``None`` (the reason is logged).
        Values are not range-checked or converted here.
    """
    try:
        data = json.loads(raw)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers a non-str reply.
        logger.warning("Evaluator %s returned invalid JSON: %.200s", model_name, raw)
        return None
    if not isinstance(data, dict):
        # Valid JSON that is a list, number or string has no keys to check;
        # treat it like any other unusable reply instead of raising.
        logger.warning("Evaluator %s returned non-object JSON: %.200s", model_name, raw)
        return None
    required = (
        "dimension_1_準確性",
        "dimension_2_完整性",
        "dimension_3_清晰度",
        "dimension_4_簡潔性",
    )
    missing = {key for key in required if key not in data}
    if missing:
        logger.warning("Evaluator %s missing required keys: %s", model_name, missing)
        return None
    return data
 async def _run_single_evaluator(
    config: EvaluatorConfig,
    prompt: str,
    model_idx: int,
 ) -> Optional[dict]:
    """Score the prompt with one evaluator model, retrying on failure.

    A dedicated client is built for the call (``LLMClient.__init__`` is
    bypassed via ``__new__`` and the attributes are set by hand) and closed
    again before returning. Up to ``MAX_RETRIES`` attempts are made, sleeping
    ``RETRY_DELAYS[attempt]`` seconds after each failed attempt except the
    last. An attempt fails when the call raises, the reply cannot be parsed,
    or the scores cannot be turned into the result models.

    Args:
        config: Evaluator settings; ``api_key_env`` names the environment
            variable holding the API key.
        prompt: Fully rendered evaluation prompt.
        model_idx: Used only to name the per-client logger.

    Returns:
        ``KeyQuestionsEvalEntry(...).model_dump()`` for the first successful
        attempt, or ``None`` when the API key is missing or all attempts fail.
    """
    api_key = os.environ.get(config.api_key_env, "")
    if not api_key:
        logger.error("API key not found for env var: %s", config.api_key_env)
        return None

    import httpx
    from openai import AsyncOpenAI

    client = LLMClient.__new__(LLMClient)
    client.settings = type("_Settings", (), {"vllm_engine": False, "llm_enable_thinking": config.enable_thinking})()
    client.model = config.model_name
    client.enable_thinking = config.enable_thinking
    client.logger = logging.getLogger(f"{__name__}.evaluator_{model_idx}")
    client._client = AsyncOpenAI(
        base_url=config.base_url.rstrip("/"),
        api_key=api_key,
        timeout=60.0,
        http_client=httpx.AsyncClient(
            headers={"Content-Type": "application/json"},
        ),
    )
    client._langchain_model = None

    step_name = f"Eval-{config.model_name}"
    try:
        for attempt in range(MAX_RETRIES):
            try:
                start = time.perf_counter()
                raw = await client.complete(
                    prompt=prompt,
                    temperature=0.3,
                    step_name=step_name,
                )
                elapsed_ms = int((time.perf_counter() - start) * 1000)
                parsed = _parse_score_response(raw, config.model_name)
                if parsed is not None:
                    scores = DimensionScores(
                        dimension_1_準確性=int(parsed["dimension_1_準確性"]),
                        dimension_2_完整性=int(parsed["dimension_2_完整性"]),
                        dimension_3_清晰度=int(parsed["dimension_3_清晰度"]),
                        dimension_4_簡潔性=int(parsed["dimension_4_簡潔性"]),
                    )
                    total = (
                        scores.dimension_1_準確性
                        + scores.dimension_2_完整性
                        + scores.dimension_3_清晰度
                        + scores.dimension_4_簡潔性
                    )
                    return KeyQuestionsEvalEntry(
                        model_name=config.model_name,
                        scores=scores,
                        total_score=total,
                        max_score=100,
                        comments=parsed.get("comments", ""),
                        thinking_trace="",
                        time_ms=elapsed_ms,
                    ).model_dump()
            except Exception as exc:
                logger.warning(
                    "Evaluator %s attempt %d/%d failed: %s",
                    config.model_name,
                    attempt + 1,
                    MAX_RETRIES,
                    exc,
                )
            if attempt < MAX_RETRIES - 1:
                await asyncio.sleep(RETRY_DELAYS[attempt])
        return None
    finally:
        # The client exists for this call only; close it so its HTTP
        # connections are released on every exit path.
        try:
            await client._client.close()
        except Exception as exc:
            logger.debug("Evaluator %s: closing HTTP client failed: %s", config.model_name, exc)
 async def evaluate_key_questions(
    original_text: str,
    extracted_questions: List[str],
    evaluator_configs: List[EvaluatorConfig],
 ) -> KeyQuestionsEvalResult:
    """Score the extracted questions with every configured evaluator model.

    All evaluators receive the same prompt and run concurrently. Evaluators
    that return ``None`` are left out. The result carries the individual
    evaluations plus per-dimension and total averages rounded to one decimal.
    With no evaluators configured, or none succeeding, an all-zero result
    with an empty evaluation list is returned.
    """

    def _empty_result() -> KeyQuestionsEvalResult:
        # Shared shape for "nothing to average".
        return KeyQuestionsEvalResult(
            evaluations=[],
            average_scores=DimensionScores(
                dimension_1_準確性=0,
                dimension_2_完整性=0,
                dimension_3_清晰度=0,
                dimension_4_簡潔性=0,
            ),
            average_total=0.0,
        )

    if not evaluator_configs:
        return _empty_result()

    prompt = _build_eval_prompt(original_text, extracted_questions)
    pending = [
        _run_single_evaluator(cfg, prompt, position)
        for position, cfg in enumerate(evaluator_configs)
    ]
    results_raw = await asyncio.gather(*pending)

    evaluations = [
        KeyQuestionsEvalEntry.model_validate(entry)
        for entry in results_raw
        if entry is not None
    ]
    if not evaluations:
        return _empty_result()

    count = len(evaluations)

    def _mean(values):
        return round(sum(values) / count, 1)

    avg_scores = DimensionScores(
        dimension_1_準確性=_mean([e.scores.dimension_1_準確性 for e in evaluations]),
        dimension_2_完整性=_mean([e.scores.dimension_2_完整性 for e in evaluations]),
        dimension_3_清晰度=_mean([e.scores.dimension_3_清晰度 for e in evaluations]),
        dimension_4_簡潔性=_mean([e.scores.dimension_4_簡潔性 for e in evaluations]),
    )
    avg_total = _mean([e.total_score for e in evaluations])
    return KeyQuestionsEvalResult(
        evaluations=evaluations,
        average_scores=avg_scores,
        average_total=avg_total,
    )
--- a/backend/app/services/response_evaluator.py
+++ b/backend/app/services/response_evaluator.py
@ -0,0 +1,119 @@
 import json
 import logging
 import os
 import time
 from typing import Any, Dict, List, Optional, Tuple
 from app.models.testing import (
    EvaluatorConfig,
    SubQuestionResponseEval,
 )
 from app.services.llm_client import LLMClient
 logger = logging.getLogger(__name__)
 # Prompt template for producing the reference answer. It takes the format fields
 # {key_question} and {chunks} and instructs the model to answer from the supplied
 # chunks only, citing sources.
 _RESPONSE_GEN_PROMPT = """使用以下文檔塊回答關鍵問題。僅使用提供的文檔塊信息，不要使用外部知識。在答案中引用來源。
 關鍵問題：{key_question}
 文檔塊：
 {chunks}
 回答："""
 # Prompt template for comparing the pipeline answer (B) against the reference
 # answer (A). It takes {key_question}, {ground_truth_response} and
 # {pipeline_response}; the doubled braces are str.format escapes for the literal
 # JSON example, which asks for completeness_score, factual_accuracy_score and
 # comments.
 _RESPONSE_COMPARE_PROMPT = """比較以下兩個回答的完整性和事實準確性。
 關鍵問題：{key_question}
 回答 A（基準答案，從相關塊生成）：
 {ground_truth_response}
 回答 B（要評估的答案）：
 {pipeline_response}
 請評估回答 B 是否包含回答 A 中的所有關鍵信息。返回JSON格式：
 {{"completeness_score": 0.0-1.0, "factual_accuracy_score": 0.0-1.0, "comments": "簡要評語"}}"""
 def _make_eval_client(config: EvaluatorConfig) -> LLMClient:
    """Build an ``LLMClient`` that talks to the evaluator endpoint in ``config``.

    The instance is created with ``LLMClient.__new__`` so ``LLMClient.__init__``
    does not run; the attributes assigned below are set by hand instead.
    NOTE(review): presumably these are exactly the attributes
    ``LLMClient.complete`` reads; confirm against ``LLMClient``.

    Returns:
        The configured client. A missing API key does not prevent
        construction; it is logged and an empty key is used.
    """
    import httpx
    from openai import AsyncOpenAI

    api_key = os.environ.get(config.api_key_env, "")
    if not api_key:
        # Still hand back a client, but make the misconfiguration visible
        # here rather than only as a failed request later.
        logger.warning("API key not found for env var: %s", config.api_key_env)

    client = LLMClient.__new__(LLMClient)
    client.settings = type("_Settings", (), {"vllm_engine": False, "llm_enable_thinking": config.enable_thinking})()
    client.model = config.model_name
    client.enable_thinking = config.enable_thinking
    client.logger = logging.getLogger(f"{__name__}.resp_eval")
    client._client = AsyncOpenAI(
        base_url=config.base_url.rstrip("/"),
        api_key=api_key,
        timeout=120.0,
        http_client=httpx.AsyncClient(headers={"Content-Type": "application/json"}),
    )
    client._langchain_model = None
    return client
 async def evaluate_response(
    key_question: str,
    ground_truth_chunks: List[Tuple[str, Dict[str, Any]]],
    pipeline_response: str,
    evaluator_config: EvaluatorConfig,
 ) -> Optional[SubQuestionResponseEval]:
    """Grade a pipeline answer against a reference answer built from chunks.

    Two model calls are made with one client: the first generates a reference
    answer from ``ground_truth_chunks`` (each rendered with its ``filename``
    and ``page_number`` metadata), the second compares ``pipeline_response``
    against that reference and returns JSON scores. The client is closed
    before returning.

    Returns:
        A ``SubQuestionResponseEval`` with both scores rounded to four
        decimals and the time spent in each step (``sub_question_index`` is
        always 0 here), or ``None`` when either call fails or the comparison
        reply is not a JSON object with numeric scores.
    """
    client = _make_eval_client(evaluator_config)
    try:
        # Step 1: Generate ground truth response from relevant chunks
        gen_start = time.perf_counter()
        chunks_text = "\n\n".join(
            f"[{meta.get('filename', 'unknown')}, page {meta.get('page_number', '?')}]\n{text}"
            for text, meta in ground_truth_chunks
        )
        gen_prompt = _RESPONSE_GEN_PROMPT.format(key_question=key_question, chunks=chunks_text)
        try:
            ground_truth_response = await client.complete(
                prompt=gen_prompt, temperature=0.3, step_name="ResponseGen-GroundTruth"
            )
        except Exception as exc:
            logger.warning("Failed to generate ground truth response: %s", exc)
            return None
        gen_time_ms = int((time.perf_counter() - gen_start) * 1000)

        # Step 2: Compare responses
        comp_start = time.perf_counter()
        comp_prompt = _RESPONSE_COMPARE_PROMPT.format(
            key_question=key_question,
            ground_truth_response=ground_truth_response,
            pipeline_response=pipeline_response,
        )
        try:
            raw = await client.complete(
                prompt=comp_prompt, temperature=0.3, step_name="ResponseCompare"
            )
            data = json.loads(raw)
            if not isinstance(data, dict):
                raise ValueError("comparison reply is not a JSON object")
            # Converted inside the try so a null or non-numeric score is
            # reported as a failed comparison instead of raising to the caller.
            completeness = float(data.get("completeness_score", 0.0))
            factual = float(data.get("factual_accuracy_score", 0.0))
        except Exception as exc:
            logger.warning("Failed to compare responses: %s", exc)
            return None
        comp_time_ms = int((time.perf_counter() - comp_start) * 1000)
        comments = data.get("comments", "")
        return SubQuestionResponseEval(
            sub_question_index=0,
            sub_question_text=key_question,
            ground_truth_response=ground_truth_response,
            pipeline_response_section=pipeline_response,
            completeness_score=round(completeness, 4),
            factual_accuracy_score=round(factual, 4),
            comments=comments,
            ground_truth_generation_time_ms=gen_time_ms,
            comparison_time_ms=comp_time_ms,
        )
    finally:
        # The client exists for this call only; close its HTTP client on
        # every exit path. Looked up defensively because the client object is
        # built by a helper.
        transport = getattr(client, "_client", None)
        closer = getattr(transport, "close", None)
        if closer is not None:
            try:
                await closer()
            except Exception as exc:
                logger.debug("Closing response evaluator client failed: %s", exc)
--- a/backend/app/test/test_phase9_cer_wer.py
+++ b/backend/app/test/test_phase9_cer_wer.py
@ -0,0 +1,83 @@
 """Phase 9 tests: CER/WER calculation for transcription accuracy (Sub-Phase 9.2).
 Covers:
 - CER for identical Chinese text returns 0.0
 - CER for single-character substitution
 - CER for deletions and insertions
 - WER for Chinese text (word-level)
 - Mixed Chinese/English text
 - Empty reference and empty hypothesis edge cases
 - N/A status when reference transcript is missing
 """
 import pytest
 from app.services.cer_wer import calculate_cer, calculate_wer
 class TestCER:
    """Character-level checks for ``calculate_cer``."""

    def test_identical_returns_zero(self):
        """Identical texts: zero rate, no errors, every character a hit."""
        text = "立法會今日討論"
        report = calculate_cer(text, text)
        assert report["cer"] == 0.0
        for field in ("substitutions", "deletions", "insertions"):
            assert report[field] == 0
        assert report["hits"] == 7

    def test_single_substitution(self):
        """One replaced character is counted as one substitution."""
        report = calculate_cer("立法會今日討論", "立法會昨日討論")
        assert report["substitutions"] == 1
        assert report["hits"] == 6
        assert report["cer"] > 0.0

    def test_deletion(self):
        """A hypothesis missing reference characters reports deletions."""
        report = calculate_cer("立法會討論議題", "立法會討論")
        assert report["cer"] > 0.0
        assert report["deletions"] >= 1

    def test_insertion(self):
        """A hypothesis with extra characters reports insertions."""
        report = calculate_cer("立法會討論", "立法會今日討論")
        assert report["cer"] > 0.0
        assert report["insertions"] >= 1

    def test_empty_reference(self):
        """An empty reference gives a zero rate and zero reference length."""
        report = calculate_cer("", "something")
        assert report["reference_length"] == 0
        assert report["cer"] == 0.0

    def test_empty_hypothesis(self):
        """An empty hypothesis deletes every reference character."""
        report = calculate_cer("立法會", "")
        assert report["deletions"] == 3
        assert report["cer"] == 1.0

    def test_both_empty(self):
        """Two empty strings give a zero rate."""
        assert calculate_cer("", "")["cer"] == 0.0

    def test_returns_all_fields(self):
        """The result dict exposes every documented key."""
        report = calculate_cer("立法會討論", "立法會討論")
        expected_keys = (
            "cer",
            "reference_length",
            "transcribed_length",
            "substitutions",
            "deletions",
            "insertions",
            "hits",
        )
        for name in expected_keys:
            assert name in report
 class TestWER:
    """Word-level checks for ``calculate_wer`` on space-separated text."""

    def test_identical_returns_zero(self):
        """Identical token sequences give a zero rate."""
        text = "立法會 今日 討論"
        assert calculate_wer(text, text)["wer"] == 0.0

    def test_word_substitution(self):
        """One replaced token is counted as one substitution."""
        report = calculate_wer("立法會 今日 討論", "立法會 昨日 討論")
        assert report["substitutions"] == 1
        assert report["wer"] > 0.0

    def test_mixed_cn_en(self):
        """Mixed Chinese/English tokens are compared token by token."""
        report = calculate_wer("LegCo 討論 議題", "LegCo 討論 政策")
        assert report["substitutions"] == 1

    def test_empty_reference(self):
        """An empty reference gives a zero rate."""
        assert calculate_wer("", "something")["wer"] == 0.0

    def test_empty_hypothesis(self):
        """An empty hypothesis gives a rate of 1.0."""
        assert calculate_wer("立法會 討論", "")["wer"] == 1.0
--- a/backend/app/test/test_phase9_chunk_response_eval.py
+++ b/backend/app/test/test_phase9_chunk_response_eval.py
@ -0,0 +1,141 @@
 """Phase 9 tests: Chunk and response evaluation (Sub-Phase 9.2)."""
 import json
 from unittest.mock import AsyncMock, patch
 import pytest
 from app.models.testing import (
    ChunkAccuracy,
    EvaluatorConfig,
    GroundTruthInfo,
    SubQuestionChunkEval,
    SubQuestionResponseEval,
 )
@pytest.fixture(autouse=True)
def _set_api_keys(monkeypatch):
    """Provide a dummy ``LLM_API_KEY`` for every test in this module."""
    env_name, env_value = "LLM_API_KEY", "test-key"
    monkeypatch.setenv(env_name, env_value)
@pytest.fixture
def chunk_evaluator_config():
    """Evaluator settings pointing at a placeholder endpoint."""
    settings = {
        "model_name": "qwen/qwen3.6-35b-a3b",
        "base_url": "https://test.example.com/v1",
        "api_key_env": "LLM_API_KEY",
        "enable_thinking": True,
    }
    return EvaluatorConfig(**settings)
@pytest.fixture
def sample_chunks_by_doc():
    """Three ``(text, metadata)`` chunks: two from ``doc-1``, one from ``doc-2``."""

    def _meta(filename, chunk_index, document_id, page_number, upload_date, content_summary):
        return {
            "filename": filename,
            "chunk_index": chunk_index,
            "document_id": document_id,
            "page_number": page_number,
            "upload_date": upload_date,
            "content_summary": content_summary,
        }

    doc_one = [
        ("chunk 0 doc1 text about立法會", _meta("doc1.pdf", 0, "doc-1", 1, "2026-01-01", "立法會 text")),
        ("chunk 1 doc1 irrelevant", _meta("doc1.pdf", 1, "doc-1", 2, "2026-01-01", "irrelevant")),
    ]
    doc_two = [
        ("chunk 0 doc2 about 討論", _meta("doc2.pdf", 0, "doc-2", 1, "2026-01-02", "討論 text")),
    ]
    return {"doc-1": doc_one, "doc-2": doc_two}
 class TestChunkEvaluator:
    """Checks for the pure helpers in ``app.services.chunk_evaluator``.

    None of these tests calls the model, so no LLM mocking is needed.
    """

    @pytest.mark.asyncio
    async def test_batch_splitting(self, sample_chunks_by_doc):
        """Three chunks with ``batch_size=2`` split into batches of 2 and 1."""
        from app.services.chunk_evaluator import _split_into_batches
        all_chunks = [(doc_id, i, text, meta) for doc_id, chunks in sample_chunks_by_doc.items() for i, (text, meta) in enumerate(chunks)]
        batches = _split_into_batches(all_chunks, batch_size=2)
        assert len(batches) == 2
        assert [len(batch) for batch in batches] == [2, 1]
        # Splitting must neither drop nor reorder chunks.
        assert [chunk for batch in batches for chunk in batch] == all_chunks

    @pytest.mark.asyncio
    async def test_relevance_from_json(self):
        """A well-formed reply yields its index list unchanged."""
        from app.services.chunk_evaluator import _parse_relevance_response
        result = _parse_relevance_response('{"relevant_chunk_indices": [0, 2, 5]}')
        assert result == [0, 2, 5]

    @pytest.mark.asyncio
    async def test_relevance_empty_response(self):
        """An empty index list is a valid answer, distinct from a parse failure."""
        from app.services.chunk_evaluator import _parse_relevance_response
        result = _parse_relevance_response('{"relevant_chunk_indices": []}')
        assert result == []

    @pytest.mark.asyncio
    async def test_relevance_invalid_json(self):
        """Text that is not JSON is reported as ``None``."""
        from app.services.chunk_evaluator import _parse_relevance_response
        result = _parse_relevance_response("not json")
        assert result is None

    @pytest.mark.asyncio
    async def test_precision_recall_f1_calculation(self):
        """One shared chunk out of two on each side gives 0.5 for every metric."""
        from app.services.chunk_evaluator import _calculate_accuracy
        retrieved = {("doc-1", 0), ("doc-1", 1)}
        ground_truth = {("doc-1", 0), ("doc-2", 0)}
        result = _calculate_accuracy(retrieved, ground_truth)
        assert result.precision == 0.5
        assert result.recall == 0.5
        assert result.f1 == 0.5

    @pytest.mark.asyncio
    async def test_perfect_accuracy(self):
        """Identical sets give 1.0 for every metric."""
        from app.services.chunk_evaluator import _calculate_accuracy
        result = _calculate_accuracy(
            {("doc-1", 0), ("doc-1", 1)},
            {("doc-1", 0), ("doc-1", 1)},
        )
        assert result.precision == 1.0
        assert result.recall == 1.0
        assert result.f1 == 1.0

    @pytest.mark.asyncio
    async def test_zero_precision(self):
        """An empty ground truth gives zero precision and zero recall."""
        from app.services.chunk_evaluator import _calculate_accuracy
        result = _calculate_accuracy(
            {("doc-1", 0)},
            set(),
        )
        assert result.precision == 0.0
        assert result.recall == 0.0
 class TestResponseEvaluator:
    """End-to-end check of ``evaluate_response`` with the model call mocked."""

    @pytest.mark.asyncio
    async def test_response_comparison(self):
        """Scores from the mocked comparison reply are carried into the result."""
        reference_answer = "## Sub-question 0\n\n- Test answer with citation [doc1.pdf, page 1]"
        comparison_reply = json.dumps({"completeness_score": 0.85, "factual_accuracy_score": 0.92, "comments": "good"})

        async def _mock_complete(*args, **kwargs):
            # The comparison prompt is recognised by its wording; any other
            # prompt is treated as the reference-answer generation call.
            prompt_text = kwargs.get("prompt", "").lower()
            if "compare" in prompt_text or "completeness" in prompt_text:
                return comparison_reply
            return reference_answer

        config = EvaluatorConfig(
            model_name="test", base_url="https://test.example.com", api_key_env="LLM_API_KEY", enable_thinking=True,
        )
        with patch("app.services.llm_client.LLMClient.complete", side_effect=_mock_complete):
            from app.services.response_evaluator import evaluate_response
            result = await evaluate_response(
                key_question="test question",
                ground_truth_chunks=[("relevant chunk text", {"filename": "doc1.pdf", "chunk_index": 0})],
                pipeline_response="pipeline answer",
                evaluator_config=config,
            )
        assert result is not None
        assert result.completeness_score == 0.85
        assert result.factual_accuracy_score == 0.92
--- a/backend/app/test/test_phase9_key_questions_eval.py
+++ b/backend/app/test/test_phase9_key_questions_eval.py
@ -0,0 +1,118 @@
 """Phase 9 tests: Key questions evaluation with dual-model scoring (Sub-Phase 9.2)."""
 import json
 from unittest.mock import AsyncMock, patch
 import pytest
 from app.models.testing import (
    EvaluatorConfig,
    KeyQuestionsEvalResult,
 )
@pytest.fixture
def evaluator_configs():
    """Two evaluator configurations, each with its own API-key variable."""
    deepseek = EvaluatorConfig(
        model_name="deepseek-v4-pro",
        base_url="https://api.deepseek.com",
        api_key_env="DP_API_KEY",
        enable_thinking=True,
    )
    qwen = EvaluatorConfig(
        model_name="qwen3-7b-max",
        base_url="https://dashscope.example.com/v1",
        api_key_env="DASHSCOPE_API_KEY",
        enable_thinking=True,
    )
    return [deepseek, qwen]
@pytest.fixture(autouse=True)
def _set_api_keys(monkeypatch):
    """Provide dummy API keys for both evaluators in every test of this module."""
    dummy_keys = (
        ("DP_API_KEY", "test-dp-key"),
        ("DASHSCOPE_API_KEY", "test-dashscope-key"),
    )
    for env_name, env_value in dummy_keys:
        monkeypatch.setenv(env_name, env_value)
@pytest.fixture
def mock_successful_complete(monkeypatch):
    """Replace ``LLMClient.complete`` with a stub that always returns valid scores."""
    score_payload = json.dumps(
        {
            "dimension_1_準確性": 35,
            "dimension_2_完整性": 22,
            "dimension_3_清晰度": 18,
            "dimension_4_簡潔性": 13,
        }
    )

    async def _fake_complete(*args, **kwargs):
        return score_payload

    monkeypatch.setattr("app.services.llm_client.LLMClient.complete", _fake_complete)
 class TestKeyQuestionsEvaluator:
    """Tests for ``evaluate_key_questions`` with ``LLMClient.complete`` replaced by stubs."""

    @pytest.mark.asyncio
    async def test_both_evaluators_succeed(self, evaluator_configs, mock_successful_complete):
        """Two configured evaluators that both reply validly give two evaluations."""
        from app.services.key_questions_evaluator import evaluate_key_questions
        result = await evaluate_key_questions(
            original_text="test text",
            extracted_questions=["test q"],
            evaluator_configs=evaluator_configs,
        )
        assert isinstance(result, KeyQuestionsEvalResult)
        assert len(result.evaluations) == 2
    @pytest.mark.asyncio
    async def test_average_calculation(self, evaluator_configs):
        """Scores 30/20/15/10 and 40/25/20/15 average to 35.0 and 22.5 on the first two dimensions."""
        call_count = 0
        # Replies are handed out in call order, one per evaluator.
        scores_sequence = [
            json.dumps({"dimension_1_準確性": 30, "dimension_2_完整性": 20, "dimension_3_清晰度": 15, "dimension_4_簡潔性": 10}),
            json.dumps({"dimension_1_準確性": 40, "dimension_2_完整性": 25, "dimension_3_清晰度": 20, "dimension_4_簡潔性": 15}),
        ]
        async def _mock_complete(**kwargs):
            nonlocal call_count
            result = scores_sequence[call_count]
            call_count += 1
            return result
        with patch("app.services.llm_client.LLMClient.complete", side_effect=_mock_complete):
            from app.services.key_questions_evaluator import evaluate_key_questions
            result = await evaluate_key_questions(
                original_text="test", extracted_questions=["q1", "q2"], evaluator_configs=evaluator_configs,
            )
        assert result.average_scores.dimension_1_準確性 == 35.0
        assert result.average_scores.dimension_2_完整性 == 22.5
    @pytest.mark.asyncio
    async def test_empty_evaluators(self):
        """With no evaluators configured the result is empty with a zero average."""
        from app.services.key_questions_evaluator import evaluate_key_questions
        result = await evaluate_key_questions(
            original_text="test", extracted_questions=["test"], evaluator_configs=[],
        )
        assert result.evaluations == []
        assert result.average_total == 0.0
    @pytest.mark.asyncio
    async def test_prompt_contains_marking_scheme(self, evaluator_configs):
        """Each evaluator is called once and the prompt mentions the first rubric dimension."""
        captured_prompts = []
        async def _capture(**kwargs):
            # Record the prompt passed by keyword, then reply with valid scores.
            captured_prompts.append(kwargs.get("prompt", ""))
            return json.dumps({"dimension_1_準確性": 30, "dimension_2_完整性": 20, "dimension_3_清晰度": 15, "dimension_4_簡潔性": 10})
        with patch("app.services.llm_client.LLMClient.complete", side_effect=_capture):
            from app.services.key_questions_evaluator import evaluate_key_questions
            await evaluate_key_questions(
                original_text="立法會今日討論", extracted_questions=["test q"], evaluator_configs=evaluator_configs,
            )
        assert len(captured_prompts) == 2
        assert "準確性" in captured_prompts[0]