60 lines
1.7 KiB
Python
60 lines
1.7 KiB
Python
import struct
|
|
import base64
|
|
import logging
|
|
|
|
import zhconv
|
|
from openai import OpenAI
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def float32_to_s16le(float32_bytes: bytes) -> bytes:
    """Convert little-endian float32 PCM samples to signed 16-bit little-endian PCM.

    Each sample is scaled by 32767, truncated toward zero and clamped to the
    int16 range [-32768, 32767].

    Args:
        float32_bytes: Raw buffer of packed little-endian 32-bit floats. Any
            trailing bytes that do not form a complete 4-byte sample are
            ignored.

    Returns:
        The converted samples packed as little-endian int16, two bytes per
        input sample.
    """
    num_samples = len(float32_bytes) // 4
    # unpack_from only needs the buffer to be *at least* as large as the
    # format, so a trailing partial sample no longer raises struct.error.
    floats = struct.unpack_from(f"<{num_samples}f", float32_bytes)
    int16_samples = [
        # Clamp while still a float so +/-inf saturates instead of making
        # int() raise OverflowError. NaN is the only value not equal to
        # itself; it is mapped to silence because int(nan) raises ValueError.
        int(max(-32768.0, min(32767.0, f * 32767.0))) if f == f else 0
        for f in floats
    ]
    return struct.pack(f"<{num_samples}h", *int16_samples)
|
|
|
|
|
|
def build_display_text(accumulated: str, current: str) -> str:
    """Combine the accumulated text and the current text into one string.

    A piece is dropped when it is empty or consists only of whitespace. The
    pieces that remain are joined with a single space and are not stripped.

    Args:
        accumulated: Text gathered so far.
        current: The newest piece of text.

    Returns:
        The kept pieces joined by a space, or "" when both are blank.
    """
    segments = []
    for piece in (accumulated, current):
        if not piece:
            continue
        if not piece.strip():
            continue
        segments.append(piece)
    return " ".join(segments)
|
|
|
|
|
|
def _to_traditional(text: str) -> str:
    """Run ``text`` through ``zhconv.convert`` with the "zh-hant" target.

    Falsy input (such as an empty string) is returned unchanged without
    calling zhconv.
    """
    return zhconv.convert(text, "zh-hant") if text else text
|
|
|
|
|
|
class ASRClient:
    """Speech-recognition client that talks to DashScope's OpenAI-compatible endpoint."""

    def __init__(self, settings):
        """Keep a reference to the settings object.

        Args:
            settings: Object from which ``transcribe_full`` reads
                ``dashscope_api_key`` and ``asr_model_name``.
        """
        self.settings = settings

    def transcribe_full(self, audio_bytes: bytes, language: str = "yue") -> str:
        """Transcribe an audio clip in a single request.

        Args:
            audio_bytes: Encoded audio. It is sent in a data URL labelled
                ``audio/wav`` — presumably WAV data; confirm against callers.
            language: Language hint forwarded as ``asr_options.language``.
                Pass ``"auto"`` to send no hint at all.

        Returns:
            The transcript passed through ``_to_traditional``, or "" when the
            response carries no choices or no message content.
        """
        audio_b64 = base64.b64encode(audio_bytes).decode()
        data_url = f"data:audio/wav;base64,{audio_b64}"

        asr_options: dict = {}
        if language != "auto":
            asr_options["language"] = language

        # A client is created per call; scope it with ``with`` so its HTTP
        # connection pool is closed deterministically instead of lingering
        # until garbage collection.
        with OpenAI(
            api_key=self.settings.dashscope_api_key,
            base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
        ) as client:
            resp = client.chat.completions.create(
                model=self.settings.asr_model_name,
                messages=[{  # type: ignore[list-item]
                    "role": "user",
                    "content": [{
                        "type": "input_audio",
                        "input_audio": {"data": data_url},
                    }],
                }],
                extra_body={"asr_options": asr_options} if asr_options else None,
            )

        # Treat an empty choices list like missing content (empty transcript)
        # rather than failing with IndexError.
        choices = resp.choices
        result = (choices[0].message.content if choices else None) or ""
        return _to_traditional(result)
|