import base64
import logging
import math
import struct

import zhconv
from openai import OpenAI

logger = logging.getLogger(__name__)

# Size in bytes of one float32 sample in the input buffer.
_FLOAT32_SIZE = 4

# OpenAI-compatible endpoint used for every transcription request.
_DASHSCOPE_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"


def _float_to_int16(sample: float) -> int:
    """Scale one float sample by 32767 and clamp it to the int16 range.

    NaN is mapped to 0. Clamping happens in the float domain so that
    infinities saturate instead of making ``int()`` raise.
    """
    if math.isnan(sample):
        return 0
    scaled = sample * 32767.0
    # Clamp before truncating: int(float("inf")) would raise OverflowError.
    return int(max(-32768.0, min(32767.0, scaled)))


def float32_to_s16le(float32_bytes: bytes) -> bytes:
    """Convert little-endian float32 samples to little-endian signed 16-bit.

    Each sample is multiplied by 32767, truncated toward zero and clamped to
    ``[-32768, 32767]``.

    Args:
        float32_bytes: Packed little-endian float32 values. Trailing bytes
            that do not form a complete 4-byte sample are ignored.

    Returns:
        The packed little-endian int16 samples, two bytes per input sample
        (empty if the input holds no complete sample).
    """
    num_samples = len(float32_bytes) // _FLOAT32_SIZE
    # struct.unpack requires the buffer length to match the format exactly,
    # so drop any incomplete trailing sample instead of raising struct.error.
    whole = float32_bytes[: num_samples * _FLOAT32_SIZE]
    floats = struct.unpack(f"<{num_samples}f", whole)
    int16_samples = [_float_to_int16(f) for f in floats]
    return struct.pack(f"<{num_samples}h", *int16_samples)


def build_display_text(accumulated: str, current: str) -> str:
    """Join the accumulated and current text with a single space.

    A piece that is empty or whitespace-only is left out. Pieces that are
    kept are not stripped; they are joined exactly as given.

    Args:
        accumulated: Text gathered so far.
        current: The newest piece of text.

    Returns:
        The kept pieces separated by one space, or ``""`` if none are kept.
    """
    parts = [p for p in (accumulated, current) if p and p.strip()]
    return " ".join(parts)


def _to_traditional(text: str) -> str:
    """Convert *text* with ``zhconv`` using the ``"zh-hant"`` locale.

    Falsy input (empty string or ``None``) is returned unchanged without
    calling ``zhconv``.
    """
    if not text:
        return text
    return zhconv.convert(text, "zh-hant")


class ASRClient:
    """Client that sends audio to an OpenAI-compatible chat-completions API.

    The settings object must expose ``dashscope_api_key`` and
    ``asr_model_name``; both are read on every ``transcribe_full`` call.
    """

    def __init__(self, settings):
        """Store *settings* for later use; no network activity happens here."""
        self.settings = settings

    def transcribe_full(self, audio_bytes: bytes, language: str = "yue") -> str:
        """Transcribe *audio_bytes* and return the text via ``_to_traditional``.

        The audio is base64-encoded into a ``data:audio/wav`` URL and sent as
        a single ``input_audio`` user message. A new ``OpenAI`` client is
        created for each call.
        NOTE(review): the data URL always declares ``audio/wav``; presumably
        callers pass WAV-encoded bytes -- confirm against callers.

        Args:
            audio_bytes: Raw bytes of the audio to transcribe.
            language: Language hint forwarded as ``asr_options["language"]``.
                The value ``"auto"`` sends no ``asr_options`` at all.

        Returns:
            The transcript after ``_to_traditional`` conversion, or ``""``
            when the response has no choices or no message content.
        """
        audio_b64 = base64.b64encode(audio_bytes).decode()
        data_url = f"data:audio/wav;base64,{audio_b64}"

        client = OpenAI(
            api_key=self.settings.dashscope_api_key,
            base_url=_DASHSCOPE_BASE_URL,
        )

        asr_options: dict = {}
        if language != "auto":
            asr_options["language"] = language

        resp = client.chat.completions.create(
            model=self.settings.asr_model_name,
            messages=[{  # type: ignore[list-item]
                "role": "user",
                "content": [{
                    "type": "input_audio",
                    "input_audio": {"data": data_url},
                }],
            }],
            # Only attach extra_body when there is at least one option to send.
            extra_body={"asr_options": asr_options} if asr_options else None,
        )

        # Guard against an empty choices list rather than raising IndexError.
        if not resp.choices:
            logger.warning("ASR response contained no choices; returning empty text")
            return ""

        result = resp.choices[0].message.content or ""
        return _to_traditional(result)