From 183fcf77720ec47d8b1171896ea2fe8079c5d885 Mon Sep 17 00:00:00 2001 From: Woody Date: Tue, 19 May 2026 09:48:12 +0800 Subject: [PATCH] refactor: make ASR client and video router provider-aware Refactor ASRClient to delegate to provider (DashScopeASRProvider or OpenRouterASRProvider) via create_asr_provider() factory. transcribe_full() now async. Move _to_traditional to asr_providers.py (re-exported from asr_client.py for backward compat). Update video.py router to await transcribe_full() and validate API key per provider (DASHSCOPE_API_KEY for dashscope, OPENROUTER_API_KEY for openrouter). Co-authored-by: Sisyphus --- backend/app/routers/video.py | 12 +++++-- backend/app/services/asr_client.py | 51 ++++++++---------------------- 2 files changed, 22 insertions(+), 41 deletions(-) diff --git a/backend/app/routers/video.py b/backend/app/routers/video.py index e2ce71e..644f783 100644 --- a/backend/app/routers/video.py +++ b/backend/app/routers/video.py @@ -94,14 +94,20 @@ async def transcribe_video(video_id: str, language: str = "yue"): from app.core.config import get_settings settings = get_settings() - if not settings.dashscope_api_key: + provider = settings.asr_provider + if provider == "dashscope" and not settings.dashscope_api_key: raise HTTPException( status_code=500, detail="DASHSCOPE_API_KEY is not configured. Set it in .env to enable transcription.", ) + if provider == "openrouter" and not settings.openrouter_api_key: + raise HTTPException( + status_code=500, + detail="OPENROUTER_API_KEY is not configured. Set it in .env to enable OpenRouter ASR.", + ) transcribe_start = time.monotonic() - logger.info("transcribe-started video_id=%s language=%s", video_id, language) + logger.info("transcribe-started video_id=%s language=%s provider=%s", video_id, language, provider) service = _get_video_service() wav_path = await service.extract_audio(video_id) @@ -110,7 +116,7 @@ async def transcribe_video(video_id: str, language: str = "yue"): audio_bytes = wav_path.read_bytes() logger.debug("audio-extracted video_id=%s wav_size=%d", video_id, len(audio_bytes)) asr = ASRClient(settings) - text = asr.transcribe_full(audio_bytes, language=language) + text = await asr.transcribe_full(audio_bytes, language=language) except Exception as e: logger.error("transcribe-failed video_id=%s error=%s", video_id, e) raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}") diff --git a/backend/app/services/asr_client.py b/backend/app/services/asr_client.py index e457d28..12b6ea0 100644 --- a/backend/app/services/asr_client.py +++ b/backend/app/services/asr_client.py @@ -1,9 +1,8 @@ import struct -import base64 import logging +from typing import Any -import zhconv -from openai import OpenAI +from app.services.asr_providers import create_asr_provider, ASRError, _to_traditional # noqa: F401 logger = logging.getLogger(__name__) @@ -20,40 +19,16 @@ def build_display_text(accumulated: str, current: str) -> str: return " ".join(parts) -def _to_traditional(text: str) -> str: - if not text: - return text - return zhconv.convert(text, "zh-hant") - - class ASRClient: - def __init__(self, settings): - self.settings = settings + def __init__(self, settings: Any): + self._settings = settings + self._provider = create_asr_provider(settings) - def transcribe_full(self, audio_bytes: bytes, language: str = "yue") -> str: - audio_b64 = base64.b64encode(audio_bytes).decode() - data_url = f"data:audio/wav;base64,{audio_b64}" - - client = OpenAI( - api_key=self.settings.dashscope_api_key, - base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1", - ) - - asr_options: dict = {} - if language != "auto": - asr_options["language"] = language - - resp = client.chat.completions.create( - model=self.settings.asr_model_name, - messages=[{ # type: ignore[list-item] - "role": "user", - "content": [{ - "type": "input_audio", - "input_audio": {"data": data_url}, - }], - }], - extra_body={"asr_options": asr_options} if asr_options else None, - ) - - result = resp.choices[0].message.content or "" - return _to_traditional(result) + async def transcribe_full(self, audio_bytes: bytes, language: str = "yue") -> str: + try: + return await self._provider.transcribe(audio_bytes, language) + except ASRError: + raise + except Exception as e: + logger.error("transcribe_full failed: %s", e) + raise ASRError(f"Transcription failed: {e}") from e