refactor: make ASR client and video router provider-aware
Refactor ASRClient to delegate to a provider (DashScopeASRProvider or OpenRouterASRProvider) created via the create_asr_provider() factory. transcribe_full() is now async. Move _to_traditional to asr_providers.py (it is re-exported from asr_client.py for backward compatibility). Update the video.py router to await transcribe_full() and to validate the API key per provider (DASHSCOPE_API_KEY for dashscope, OPENROUTER_API_KEY for openrouter).

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
39525a2344
commit
183fcf7772
|
|
@ -94,14 +94,20 @@ async def transcribe_video(video_id: str, language: str = "yue"):
|
||||||
from app.core.config import get_settings
|
from app.core.config import get_settings
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
|
|
||||||
if not settings.dashscope_api_key:
|
provider = settings.asr_provider
|
||||||
|
if provider == "dashscope" and not settings.dashscope_api_key:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=500,
|
status_code=500,
|
||||||
detail="DASHSCOPE_API_KEY is not configured. Set it in .env to enable transcription.",
|
detail="DASHSCOPE_API_KEY is not configured. Set it in .env to enable transcription.",
|
||||||
)
|
)
|
||||||
|
if provider == "openrouter" and not settings.openrouter_api_key:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail="OPENROUTER_API_KEY is not configured. Set it in .env to enable OpenRouter ASR.",
|
||||||
|
)
|
||||||
|
|
||||||
transcribe_start = time.monotonic()
|
transcribe_start = time.monotonic()
|
||||||
logger.info("transcribe-started video_id=%s language=%s", video_id, language)
|
logger.info("transcribe-started video_id=%s language=%s provider=%s", video_id, language, provider)
|
||||||
|
|
||||||
service = _get_video_service()
|
service = _get_video_service()
|
||||||
wav_path = await service.extract_audio(video_id)
|
wav_path = await service.extract_audio(video_id)
|
||||||
|
|
@ -110,7 +116,7 @@ async def transcribe_video(video_id: str, language: str = "yue"):
|
||||||
audio_bytes = wav_path.read_bytes()
|
audio_bytes = wav_path.read_bytes()
|
||||||
logger.debug("audio-extracted video_id=%s wav_size=%d", video_id, len(audio_bytes))
|
logger.debug("audio-extracted video_id=%s wav_size=%d", video_id, len(audio_bytes))
|
||||||
asr = ASRClient(settings)
|
asr = ASRClient(settings)
|
||||||
text = asr.transcribe_full(audio_bytes, language=language)
|
text = await asr.transcribe_full(audio_bytes, language=language)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("transcribe-failed video_id=%s error=%s", video_id, e)
|
logger.error("transcribe-failed video_id=%s error=%s", video_id, e)
|
||||||
raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
|
raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,8 @@
|
||||||
import struct
|
import struct
|
||||||
import base64
|
|
||||||
import logging
|
import logging
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
import zhconv
|
from app.services.asr_providers import create_asr_provider, ASRError, _to_traditional # noqa: F401
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -20,40 +19,16 @@ def build_display_text(accumulated: str, current: str) -> str:
|
||||||
return " ".join(parts)
|
return " ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
def _to_traditional(text: str) -> str:
|
|
||||||
if not text:
|
|
||||||
return text
|
|
||||||
return zhconv.convert(text, "zh-hant")
|
|
||||||
|
|
||||||
|
|
||||||
class ASRClient:
|
class ASRClient:
|
||||||
def __init__(self, settings):
|
def __init__(self, settings: Any):
|
||||||
self.settings = settings
|
self._settings = settings
|
||||||
|
self._provider = create_asr_provider(settings)
|
||||||
|
|
||||||
def transcribe_full(self, audio_bytes: bytes, language: str = "yue") -> str:
|
async def transcribe_full(self, audio_bytes: bytes, language: str = "yue") -> str:
|
||||||
audio_b64 = base64.b64encode(audio_bytes).decode()
|
try:
|
||||||
data_url = f"data:audio/wav;base64,{audio_b64}"
|
return await self._provider.transcribe(audio_bytes, language)
|
||||||
|
except ASRError:
|
||||||
client = OpenAI(
|
raise
|
||||||
api_key=self.settings.dashscope_api_key,
|
except Exception as e:
|
||||||
base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
|
logger.error("transcribe_full failed: %s", e)
|
||||||
)
|
raise ASRError(f"Transcription failed: {e}") from e
|
||||||
|
|
||||||
asr_options: dict = {}
|
|
||||||
if language != "auto":
|
|
||||||
asr_options["language"] = language
|
|
||||||
|
|
||||||
resp = client.chat.completions.create(
|
|
||||||
model=self.settings.asr_model_name,
|
|
||||||
messages=[{ # type: ignore[list-item]
|
|
||||||
"role": "user",
|
|
||||||
"content": [{
|
|
||||||
"type": "input_audio",
|
|
||||||
"input_audio": {"data": data_url},
|
|
||||||
}],
|
|
||||||
}],
|
|
||||||
extra_body={"asr_options": asr_options} if asr_options else None,
|
|
||||||
)
|
|
||||||
|
|
||||||
result = resp.choices[0].message.content or ""
|
|
||||||
return _to_traditional(result)
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue