From 9934749d2b47372d4d5ae379f7e5acf2bd38b005 Mon Sep 17 00:00:00 2001 From: Woody Date: Wed, 6 May 2026 13:08:19 +0800 Subject: [PATCH] feat: Phase 2.1 config + infrastructure and 2.2 video upload backend - Add DashScope ASR and video upload config fields to Settings - Create Pydantic models (video.py, asr.py) - Create VideoService with validation, save, serve, delete - Create ASR client stub with float32_to_s16le utility - Implement POST /api/v1/video/upload with streaming validation - Implement GET /api/v1/video/{video_id} with FileResponse - Create WebSocket ASR endpoint stub - Register new routers in main.py - Update .env.example and requirements.txt - Add reference examples for DashScope integration - 8 tests passing (3 config + 5 video upload) --- .examples/README.md | 49 +++ .examples/alibaba_asr_backend.py | 265 +++++++++++++++ .examples/alibaba_asr_frontend_react.tsx | 244 ++++++++++++++ .examples/alibaba_asr_frontend_vanilla.html | 159 +++++++++ .plans/phase2_implementation_plan.md | 321 +++++++++++++++++++ backend/.env.example | 10 + backend/app/core/config.py | 10 + backend/app/main.py | 4 +- backend/app/models/asr.py | 8 + backend/app/models/video.py | 21 ++ backend/app/routers/video.py | 77 +++++ backend/app/routers/ws_asr.py | 16 + backend/app/services/asr_client.py | 25 ++ backend/app/services/video_service.py | 35 ++ backend/app/test/test_phase2_config.py | 63 ++++ backend/app/test/test_phase2_video_upload.py | 83 ++++- backend/requirements.txt | 3 + 17 files changed, 1378 insertions(+), 15 deletions(-) create mode 100644 .examples/README.md create mode 100644 .examples/alibaba_asr_backend.py create mode 100644 .examples/alibaba_asr_frontend_react.tsx create mode 100644 .examples/alibaba_asr_frontend_vanilla.html create mode 100644 .plans/phase2_implementation_plan.md create mode 100644 backend/app/models/asr.py create mode 100644 backend/app/models/video.py create mode 100644 backend/app/routers/video.py create mode 100644 backend/app/routers/ws_asr.py create mode 100644 backend/app/services/asr_client.py create mode 100644 backend/app/services/video_service.py create mode 100644 backend/app/test/test_phase2_config.py diff --git a/.examples/README.md b/.examples/README.md new file mode 100644 index 0000000..727218c --- /dev/null +++ b/.examples/README.md @@ -0,0 +1,49 @@ +# Alibaba Cloud DashScope ASR — Reference Examples + +Adapted from `/mnt/c/Users/woody/Documents/projects/voice input/` (Cantonese voice-to-text web app). + +## Files + +| File | What | Language | +|------|------|----------| +| `alibaba_asr_backend.py` | FastAPI WebSocket proxy to DashScope realtime ASR | Python | +| `alibaba_asr_frontend_vanilla.html` | Browser audio capture + WebSocket (vanilla JS) | HTML/JS | +| `alibaba_asr_frontend_react.tsx` | React/TS hook + component for audio capture | TypeScript/React | + +## Architecture + +``` +Browser (Float32 PCM, 16kHz mono) + │ WebSocket: send(float32Data.buffer) + ▼ +FastAPI Backend (/ws/asr/{video_id}) + │ Convert Float32 → S16_LE → base64 + ▼ +Alibaba Cloud DashScope (wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime) + │ Model: qwen3-asr-flash-realtime + ▼ Language: yue (Cantonese) +Transcript JSON → Browser +``` + +## Key Details + +- **Audio format**: Float32 PCM, 16kHz, mono (browser) → S16_LE PCM, 16kHz, mono, base64 (DashScope) +- **Model**: `qwen3-asr-flash-realtime` (WebSocket realtime, unlimited duration) +- **Endpoint**: `wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime` +- **SDK**: `pip install dashscope>=0.4.0` +- **Cantonese**: Language code `yue` (works natively with DashScope) +- **VAD**: Server-side (Alibaba Cloud handles voice activity detection) +- **Pricing**: ~$0.00009/second +- **Features**: Punctuation, ITN, filler word filtering, multi-language auto-detect + +## Dependencies + +``` +# Python +dashscope>=0.4.0 +openai>=1.52.0 +zhconv>=1.4.0 # Simplified → Traditional Chinese (optional) + +# No additional JS deps needed — native Web APIs only: +# WebSocket, AudioContext, ScriptProcessorNode, getUserMedia +``` diff --git a/.examples/alibaba_asr_backend.py b/.examples/alibaba_asr_backend.py new file mode 100644 index 0000000..9351024 --- /dev/null +++ b/.examples/alibaba_asr_backend.py @@ -0,0 +1,265 @@ +""" +Reference: Alibaba Cloud DashScope Real-Time ASR Proxy (FastAPI WebSocket). + +Extracted and adapted from: + /mnt/c/Users/woody/Documents/projects/voice input/backend/app.py + +Architecture: + Browser (Float32 PCM) → FastAPI WebSocket → DashScope Real-Time WebSocket + API key NEVER leaves the server. Backend proxies audio to Alibaba Cloud. + +Key imports: + pip install dashscope>=0.4.0 openai>=1.52.0 zhconv +""" +import os +import json +import struct +import asyncio +import base64 +import logging + +from fastapi import FastAPI, WebSocket, WebSocketDisconnect +from dotenv import load_dotenv +import zhconv # Optional: simplified → traditional Chinese conversion + +from dashscope.audio.qwen_omni.omni_realtime import ( + OmniRealtimeConversation, + OmniRealtimeCallback, + TranscriptionParams, + MultiModality, +) + +load_dotenv() + +logger = logging.getLogger("asr-proxy") + +DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY") + +# ─── Audio Conversion: Float32 PCM → S16_LE ───────────────────────────────── + +def float32_to_s16le(float32_bytes: bytes) -> bytes: + """Convert browser Float32 PCM to S16_LE required by DashScope. + + Browser: Float32Array (values -1.0 to 1.0) → raw bytes + DashScope: S16_LE PCM 16kHz mono → base64 encoded + """ + num_samples = len(float32_bytes) // 4 + floats = struct.unpack(f"<{num_samples}f", float32_bytes) + int16_samples = [max(-32768, min(32767, int(f * 32767.0))) for f in floats] + return struct.pack(f"<{num_samples}h", *int16_samples) + + +def _to_traditional(text: str) -> str: + """Convert Simplified Chinese to Traditional (for Cantonese display).""" + if not text: + return text + return zhconv.convert(text, "zh-hant") + + +def build_display_text(accumulated: str, current: str) -> str: + """Assemble multi-utterance display text.""" + parts = [p for p in (accumulated, current) if p and p.strip()] + return " ".join(parts) + + +# ─── DashScope Callback Bridge (Sync → Async) ─────────────────────────────── + +class DashScopeCallback(OmniRealtimeCallback): + """Bridges sync DashScope SDK callbacks to async WebSocket messages. + + The DashScope SDK fires callbacks from a background thread. + We push events to an asyncio.Queue so the async task can read them. + """ + + def __init__(self, event_queue: asyncio.Queue, loop: asyncio.AbstractEventLoop): + super().__init__() + self._queue = event_queue + self._loop = loop + + def on_open(self): + logger.info("DashScope realtime connection opened") + + def on_event(self, message): + """Called from SDK background thread.""" + try: + event = json.loads(message) if isinstance(message, str) else message + self._loop.call_soon_threadsafe(self._queue.put_nowait, event) + except Exception as e: + logger.error(f"DashScope callback error: {e}") + + def on_close(self, code, msg): + logger.info(f"DashScope realtime closed: code={code}, msg={msg}") + + +# ─── WebSocket Proxy Handler ──────────────────────────────────────────────── + +async def ws_proxy_dashscope( + client_ws: WebSocket, + loop: asyncio.AbstractEventLoop, + language: str = "yue", # "yue" for Cantonese, "zh" for Mandarin, "en" for English +): + """Proxy browser audio to DashScope real-time ASR. + + Flow: + 1. Browser sends Float32 PCM bytes via WebSocket + 2. Backend converts to S16_LE → base64 + 3. Append audio to DashScope conversation + 4. DashScope SDK sends events → callback → asyncio.Queue + 5. Read events from queue → format as JSON → send to browser + + Protocol (backend → browser): + Partial: {"delta": "", "full_text": "...", "language": "yue", "is_final": false} + Final: {"delta": "", "full_text": "...", "language": "yue", "is_final": true} + """ + event_queue: asyncio.Queue = asyncio.Queue() + callback = DashScopeCallback(event_queue, loop) + + # Initialize DashScope real-time conversation + conversation = OmniRealtimeConversation( + model="qwen3-asr-flash-realtime", + url="wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime", + callback=callback, + ) + + # connect() is synchronous — run in executor to avoid blocking + await loop.run_in_executor(None, conversation.connect) + logger.info("dashscope-session-connected") + + # Configure session: text output + audio transcription + transcription_kwargs: dict = { + "sample_rate": 16000, + "input_audio_format": "pcm", + } + if language != "auto": + transcription_kwargs["language"] = language + + transcription_params = TranscriptionParams(**transcription_kwargs) + conversation.update_session( + output_modalities=[MultiModality.TEXT], + enable_input_audio_transcription=True, + transcription_params=transcription_params, + ) + logger.info("dashscope-session-updated lang=%s", language) + + accumulated_text = "" + current_lang = language + + async def read_events(): + """Async task: read DashScope events from queue → send to browser.""" + nonlocal accumulated_text, current_lang + while True: + event = await event_queue.get() + event_type = event.get("type", "") + + if event_type == "conversation.item.input_audio_transcription.text": + # Partial result (in-progress utterance) + stash = event.get("stash", "") + display = ( + build_display_text(accumulated_text, stash) + if stash else accumulated_text + ) + await client_ws.send_json({ + "delta": "", + "full_text": _to_traditional(display), + "language": event.get("language", current_lang), + "is_final": False, + }) + + elif event_type == "conversation.item.input_audio_transcription.completed": + # Final utterance completed + transcript = event.get("transcript", "") + if transcript and transcript.strip(): + accumulated_text = build_display_text(accumulated_text, transcript) + current_lang = event.get("language", current_lang) + logger.info( + "dashscope-utterance lang=%s text_len=%d accumulated_len=%d", + current_lang, len(transcript), len(accumulated_text), + ) + await client_ws.send_json({ + "delta": "", + "full_text": _to_traditional(accumulated_text), + "language": current_lang, + "is_final": True, + }) + + read_task = asyncio.create_task(read_events()) + + try: + # Main loop: receive Float32 PCM from browser → convert → send to DashScope + while True: + float32_bytes = await client_ws.receive_bytes() + s16_bytes = float32_to_s16le(float32_bytes) + audio_b64 = base64.b64encode(s16_bytes).decode("ascii") + conversation.append_audio(audio_b64) + except WebSocketDisconnect: + pass + finally: + read_task.cancel() + conversation.close() + logger.info( + "dashscope-session-closed text_len=%d", + len(accumulated_text), + ) + + +# ─── FastAPI App Setup ────────────────────────────────────────────────────── + +app = FastAPI() + +@app.websocket("/ws/asr/{video_id}") +async def ws_asr_endpoint(websocket: WebSocket, video_id: str, language: str = "yue"): + """WebSocket endpoint for real-time ASR. + + Query params: + language: "yue" (Cantonese), "zh" (Mandarin), "en" (English), "auto" + """ + await websocket.accept() + loop = asyncio.get_event_loop() + logger.info("ws-connect video_id=%s lang=%s", video_id, language) + + await ws_proxy_dashscope(websocket, loop, language) + + logger.info("ws-disconnect video_id=%s", video_id) + + +# ─── Non-Streaming Fallback (POST) ────────────────────────────────────────── + +from fastapi import UploadFile, File +from openai import OpenAI + +# Sync client for non-streaming fallback +sync_client = OpenAI( + api_key=DASHSCOPE_API_KEY, + base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1", +) + +@app.post("/api/v1/asr/transcribe") +async def transcribe_file(file: UploadFile = File(...), language: str = "yue"): + """Non-streaming fallback: transcribe an uploaded audio file.""" + audio = await file.read() + + # Build data URL: data:;base64, + audio_b64 = base64.b64encode(audio).decode() + data_url = f"data:;base64,{audio_b64}" + + resp = sync_client.chat.completions.create( + model="qwen3-asr-flash", + messages=[{ + "role": "user", + "content": [{ + "type": "input_audio", + "input_audio": {"data": data_url}, + }], + }], + extra_body={ + "asr_options": { + "language": language if language != "auto" else None, + } + }, + ) + + result = resp.choices[0].message.content + return { + "text": _to_traditional(result), + "language": language, + } diff --git a/.examples/alibaba_asr_frontend_react.tsx b/.examples/alibaba_asr_frontend_react.tsx new file mode 100644 index 0000000..6c34b1d --- /dev/null +++ b/.examples/alibaba_asr_frontend_react.tsx @@ -0,0 +1,244 @@ +/** + * Reference: React/TypeScript adaptation of VIDEO AUDIO capture for Alibaba Cloud ASR. + * + * Two modes: + * A. Streaming (real-time): capture