feat: Phase 2.1 config + infrastructure and 2.2 video upload backend
- Add DashScope ASR and video upload config fields to Settings
- Create Pydantic models (video.py, asr.py)
- Create VideoService with validation, save, serve, delete
- Create ASR client stub with float32_to_s16le utility
- Implement POST /api/v1/video/upload with streaming validation
- Implement GET /api/v1/video/{video_id} with FileResponse
- Create WebSocket ASR endpoint stub
- Register new routers in main.py
- Update .env.example and requirements.txt
- Add reference examples for DashScope integration
- 8 tests passing (3 config + 5 video upload)
This commit is contained in:
parent
63e4c1a385
commit
9934749d2b
|
|
@ -0,0 +1,49 @@
|
|||
# Alibaba Cloud DashScope ASR — Reference Examples
|
||||
|
||||
Adapted from `/mnt/c/Users/woody/Documents/projects/voice input/` (Cantonese voice-to-text web app).
|
||||
|
||||
## Files
|
||||
|
||||
| File | What | Language |
|
||||
|------|------|----------|
|
||||
| `alibaba_asr_backend.py` | FastAPI WebSocket proxy to DashScope realtime ASR | Python |
|
||||
| `alibaba_asr_frontend_vanilla.html` | Browser audio capture + WebSocket (vanilla JS) | HTML/JS |
|
||||
| `alibaba_asr_frontend_react.tsx` | React/TS hook + component for audio capture | TypeScript/React |
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
Browser (Float32 PCM, 16kHz mono)
|
||||
│ WebSocket: send(float32Data.buffer)
|
||||
▼
|
||||
FastAPI Backend (/ws/asr/{video_id})
|
||||
│ Convert Float32 → S16_LE → base64
|
||||
▼
|
||||
Alibaba Cloud DashScope (wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime)
|
||||
│ Model: qwen3-asr-flash-realtime
|
||||
▼ Language: yue (Cantonese)
|
||||
Transcript JSON → Browser
|
||||
```
|
||||
|
||||
## Key Details
|
||||
|
||||
- **Audio format**: Float32 PCM, 16kHz, mono (browser) → S16_LE PCM, 16kHz, mono, base64 (DashScope)
|
||||
- **Model**: `qwen3-asr-flash-realtime` (WebSocket realtime, unlimited duration)
|
||||
- **Endpoint**: `wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime`
|
||||
- **SDK**: `pip install dashscope>=0.4.0`
|
||||
- **Cantonese**: Language code `yue` (works natively with DashScope)
|
||||
- **VAD**: Server-side (Alibaba Cloud handles voice activity detection)
|
||||
- **Pricing**: ~$0.00009/second
|
||||
- **Features**: Punctuation, ITN, filler word filtering, multi-language auto-detect
|
||||
|
||||
## Dependencies
|
||||
|
||||
```
|
||||
# Python
|
||||
dashscope>=0.4.0
|
||||
openai>=1.52.0
|
||||
zhconv>=1.4.0 # Simplified → Traditional Chinese (optional)
|
||||
|
||||
# No additional JS deps needed — native Web APIs only:
|
||||
# WebSocket, AudioContext, ScriptProcessorNode, getUserMedia
|
||||
```
|
||||
|
|
@ -0,0 +1,265 @@
|
|||
"""
|
||||
Reference: Alibaba Cloud DashScope Real-Time ASR Proxy (FastAPI WebSocket).
|
||||
|
||||
Extracted and adapted from:
|
||||
/mnt/c/Users/woody/Documents/projects/voice input/backend/app.py
|
||||
|
||||
Architecture:
|
||||
Browser (Float32 PCM) → FastAPI WebSocket → DashScope Real-Time WebSocket
|
||||
API key NEVER leaves the server. Backend proxies audio to Alibaba Cloud.
|
||||
|
||||
Key imports:
|
||||
pip install dashscope>=0.4.0 openai>=1.52.0 zhconv
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
import struct
|
||||
import asyncio
|
||||
import base64
|
||||
import logging
|
||||
|
||||
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
||||
from dotenv import load_dotenv
|
||||
import zhconv # Optional: simplified → traditional Chinese conversion
|
||||
|
||||
from dashscope.audio.qwen_omni.omni_realtime import (
|
||||
OmniRealtimeConversation,
|
||||
OmniRealtimeCallback,
|
||||
TranscriptionParams,
|
||||
MultiModality,
|
||||
)
|
||||
|
||||
load_dotenv()
|
||||
|
||||
logger = logging.getLogger("asr-proxy")
|
||||
|
||||
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY")
|
||||
|
||||
# ─── Audio Conversion: Float32 PCM → S16_LE ─────────────────────────────────
|
||||
|
||||
def float32_to_s16le(float32_bytes: bytes) -> bytes:
|
||||
"""Convert browser Float32 PCM to S16_LE required by DashScope.
|
||||
|
||||
Browser: Float32Array (values -1.0 to 1.0) → raw bytes
|
||||
DashScope: S16_LE PCM 16kHz mono → base64 encoded
|
||||
"""
|
||||
num_samples = len(float32_bytes) // 4
|
||||
floats = struct.unpack(f"<{num_samples}f", float32_bytes)
|
||||
int16_samples = [max(-32768, min(32767, int(f * 32767.0))) for f in floats]
|
||||
return struct.pack(f"<{num_samples}h", *int16_samples)
|
||||
|
||||
|
||||
def _to_traditional(text: str) -> str:
|
||||
"""Convert Simplified Chinese to Traditional (for Cantonese display)."""
|
||||
if not text:
|
||||
return text
|
||||
return zhconv.convert(text, "zh-hant")
|
||||
|
||||
|
||||
def build_display_text(accumulated: str, current: str) -> str:
|
||||
"""Assemble multi-utterance display text."""
|
||||
parts = [p for p in (accumulated, current) if p and p.strip()]
|
||||
return " ".join(parts)
|
||||
|
||||
|
||||
# ─── DashScope Callback Bridge (Sync → Async) ───────────────────────────────
|
||||
|
||||
class DashScopeCallback(OmniRealtimeCallback):
|
||||
"""Bridges sync DashScope SDK callbacks to async WebSocket messages.
|
||||
|
||||
The DashScope SDK fires callbacks from a background thread.
|
||||
We push events to an asyncio.Queue so the async task can read them.
|
||||
"""
|
||||
|
||||
def __init__(self, event_queue: asyncio.Queue, loop: asyncio.AbstractEventLoop):
|
||||
super().__init__()
|
||||
self._queue = event_queue
|
||||
self._loop = loop
|
||||
|
||||
def on_open(self):
|
||||
logger.info("DashScope realtime connection opened")
|
||||
|
||||
def on_event(self, message):
|
||||
"""Called from SDK background thread."""
|
||||
try:
|
||||
event = json.loads(message) if isinstance(message, str) else message
|
||||
self._loop.call_soon_threadsafe(self._queue.put_nowait, event)
|
||||
except Exception as e:
|
||||
logger.error(f"DashScope callback error: {e}")
|
||||
|
||||
def on_close(self, code, msg):
|
||||
logger.info(f"DashScope realtime closed: code={code}, msg={msg}")
|
||||
|
||||
|
||||
# ─── WebSocket Proxy Handler ────────────────────────────────────────────────
|
||||
|
||||
async def ws_proxy_dashscope(
|
||||
client_ws: WebSocket,
|
||||
loop: asyncio.AbstractEventLoop,
|
||||
language: str = "yue", # "yue" for Cantonese, "zh" for Mandarin, "en" for English
|
||||
):
|
||||
"""Proxy browser audio to DashScope real-time ASR.
|
||||
|
||||
Flow:
|
||||
1. Browser sends Float32 PCM bytes via WebSocket
|
||||
2. Backend converts to S16_LE → base64
|
||||
3. Append audio to DashScope conversation
|
||||
4. DashScope SDK sends events → callback → asyncio.Queue
|
||||
5. Read events from queue → format as JSON → send to browser
|
||||
|
||||
Protocol (backend → browser):
|
||||
Partial: {"delta": "", "full_text": "...", "language": "yue", "is_final": false}
|
||||
Final: {"delta": "", "full_text": "...", "language": "yue", "is_final": true}
|
||||
"""
|
||||
event_queue: asyncio.Queue = asyncio.Queue()
|
||||
callback = DashScopeCallback(event_queue, loop)
|
||||
|
||||
# Initialize DashScope real-time conversation
|
||||
conversation = OmniRealtimeConversation(
|
||||
model="qwen3-asr-flash-realtime",
|
||||
url="wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime",
|
||||
callback=callback,
|
||||
)
|
||||
|
||||
# connect() is synchronous — run in executor to avoid blocking
|
||||
await loop.run_in_executor(None, conversation.connect)
|
||||
logger.info("dashscope-session-connected")
|
||||
|
||||
# Configure session: text output + audio transcription
|
||||
transcription_kwargs: dict = {
|
||||
"sample_rate": 16000,
|
||||
"input_audio_format": "pcm",
|
||||
}
|
||||
if language != "auto":
|
||||
transcription_kwargs["language"] = language
|
||||
|
||||
transcription_params = TranscriptionParams(**transcription_kwargs)
|
||||
conversation.update_session(
|
||||
output_modalities=[MultiModality.TEXT],
|
||||
enable_input_audio_transcription=True,
|
||||
transcription_params=transcription_params,
|
||||
)
|
||||
logger.info("dashscope-session-updated lang=%s", language)
|
||||
|
||||
accumulated_text = ""
|
||||
current_lang = language
|
||||
|
||||
async def read_events():
|
||||
"""Async task: read DashScope events from queue → send to browser."""
|
||||
nonlocal accumulated_text, current_lang
|
||||
while True:
|
||||
event = await event_queue.get()
|
||||
event_type = event.get("type", "")
|
||||
|
||||
if event_type == "conversation.item.input_audio_transcription.text":
|
||||
# Partial result (in-progress utterance)
|
||||
stash = event.get("stash", "")
|
||||
display = (
|
||||
build_display_text(accumulated_text, stash)
|
||||
if stash else accumulated_text
|
||||
)
|
||||
await client_ws.send_json({
|
||||
"delta": "",
|
||||
"full_text": _to_traditional(display),
|
||||
"language": event.get("language", current_lang),
|
||||
"is_final": False,
|
||||
})
|
||||
|
||||
elif event_type == "conversation.item.input_audio_transcription.completed":
|
||||
# Final utterance completed
|
||||
transcript = event.get("transcript", "")
|
||||
if transcript and transcript.strip():
|
||||
accumulated_text = build_display_text(accumulated_text, transcript)
|
||||
current_lang = event.get("language", current_lang)
|
||||
logger.info(
|
||||
"dashscope-utterance lang=%s text_len=%d accumulated_len=%d",
|
||||
current_lang, len(transcript), len(accumulated_text),
|
||||
)
|
||||
await client_ws.send_json({
|
||||
"delta": "",
|
||||
"full_text": _to_traditional(accumulated_text),
|
||||
"language": current_lang,
|
||||
"is_final": True,
|
||||
})
|
||||
|
||||
read_task = asyncio.create_task(read_events())
|
||||
|
||||
try:
|
||||
# Main loop: receive Float32 PCM from browser → convert → send to DashScope
|
||||
while True:
|
||||
float32_bytes = await client_ws.receive_bytes()
|
||||
s16_bytes = float32_to_s16le(float32_bytes)
|
||||
audio_b64 = base64.b64encode(s16_bytes).decode("ascii")
|
||||
conversation.append_audio(audio_b64)
|
||||
except WebSocketDisconnect:
|
||||
pass
|
||||
finally:
|
||||
read_task.cancel()
|
||||
conversation.close()
|
||||
logger.info(
|
||||
"dashscope-session-closed text_len=%d",
|
||||
len(accumulated_text),
|
||||
)
|
||||
|
||||
|
||||
# ─── FastAPI App Setup ──────────────────────────────────────────────────────
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
@app.websocket("/ws/asr/{video_id}")
|
||||
async def ws_asr_endpoint(websocket: WebSocket, video_id: str, language: str = "yue"):
|
||||
"""WebSocket endpoint for real-time ASR.
|
||||
|
||||
Query params:
|
||||
language: "yue" (Cantonese), "zh" (Mandarin), "en" (English), "auto"
|
||||
"""
|
||||
await websocket.accept()
|
||||
loop = asyncio.get_event_loop()
|
||||
logger.info("ws-connect video_id=%s lang=%s", video_id, language)
|
||||
|
||||
await ws_proxy_dashscope(websocket, loop, language)
|
||||
|
||||
logger.info("ws-disconnect video_id=%s", video_id)
|
||||
|
||||
|
||||
# ─── Non-Streaming Fallback (POST) ──────────────────────────────────────────
|
||||
|
||||
from fastapi import UploadFile, File
|
||||
from openai import OpenAI
|
||||
|
||||
# Sync client for non-streaming fallback
|
||||
sync_client = OpenAI(
|
||||
api_key=DASHSCOPE_API_KEY,
|
||||
base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
|
||||
)
|
||||
|
||||
@app.post("/api/v1/asr/transcribe")
|
||||
async def transcribe_file(file: UploadFile = File(...), language: str = "yue"):
|
||||
"""Non-streaming fallback: transcribe an uploaded audio file."""
|
||||
audio = await file.read()
|
||||
|
||||
# Build data URL: data:;base64,<base64_audio>
|
||||
audio_b64 = base64.b64encode(audio).decode()
|
||||
data_url = f"data:;base64,{audio_b64}"
|
||||
|
||||
resp = sync_client.chat.completions.create(
|
||||
model="qwen3-asr-flash",
|
||||
messages=[{
|
||||
"role": "user",
|
||||
"content": [{
|
||||
"type": "input_audio",
|
||||
"input_audio": {"data": data_url},
|
||||
}],
|
||||
}],
|
||||
extra_body={
|
||||
"asr_options": {
|
||||
"language": language if language != "auto" else None,
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
result = resp.choices[0].message.content
|
||||
return {
|
||||
"text": _to_traditional(result),
|
||||
"language": language,
|
||||
}
|
||||
|
|
@ -0,0 +1,244 @@
|
|||
/**
|
||||
* Reference: React/TypeScript adaptation of VIDEO AUDIO capture for Alibaba Cloud ASR.
|
||||
*
|
||||
* Two modes:
|
||||
* A. Streaming (real-time): capture <video> element's audio during playback
|
||||
* → WebSocket → backend → DashScope realtime → transcript in QueryInput
|
||||
* B. Full Transcript (batch): POST /api/v1/video/{id}/transcribe
|
||||
* → backend extracts audio via ffmpeg → DashScope non-streaming → full transcript
|
||||
*
|
||||
* Audio pipeline (streaming mode):
|
||||
* <video> element → AudioContext.createMediaElementSource(video)
|
||||
* → ScriptProcessor(4096, 1, 1) → Float32Array
|
||||
* → WebSocket.send(float32Data.buffer) → Backend → DashScope
|
||||
*
|
||||
* IMPORTANT:
|
||||
* - processor.connect(audioContext.destination) so audio still plays through speakers
|
||||
* - No getUserMedia() needed (no microphone permission)
|
||||
* - Full Transcript mode uses backend ffmpeg to extract audio server-side
|
||||
*/
|
||||
|
||||
import { useState, useRef, useCallback, useEffect } from 'react';
|
||||
|
||||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
interface ASRMessage {
|
||||
delta: string;
|
||||
full_text: string;
|
||||
language: string;
|
||||
is_final: boolean;
|
||||
}
|
||||
|
||||
type ASRStatus = 'idle' | 'connecting' | 'streaming' | 'disconnected' | 'error';
|
||||
|
||||
interface UseVideoASROptions {
|
||||
videoId: string;
|
||||
videoElement: HTMLVideoElement | null;
|
||||
language?: string; // "yue" | "zh" | "en" | "auto"
|
||||
onFinalTranscript?: (text: string) => void;
|
||||
}
|
||||
|
||||
// ─── Hook: useVideoASR (Streaming Mode) ─────────────────────────────────────
|
||||
|
||||
export function useVideoASR({
|
||||
videoId,
|
||||
videoElement,
|
||||
language = 'yue',
|
||||
onFinalTranscript,
|
||||
}: UseVideoASROptions) {
|
||||
const [transcript, setTranscript] = useState('');
|
||||
const [partialTranscript, setPartialTranscript] = useState('');
|
||||
const [status, setStatus] = useState<ASRStatus>('idle');
|
||||
const [isStreaming, setIsStreaming] = useState(false);
|
||||
|
||||
const wsRef = useRef<WebSocket | null>(null);
|
||||
const audioContextRef = useRef<AudioContext | null>(null);
|
||||
const processorRef = useRef<ScriptProcessorNode | null>(null);
|
||||
const sourceRef = useRef<MediaElementAudioSourceNode | null>(null);
|
||||
const isStreamingRef = useRef(false);
|
||||
|
||||
const getWSURL = useCallback(() => {
|
||||
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
|
||||
const host = window.location.host;
|
||||
const langParam = language !== 'auto' ? `?language=${language}` : '';
|
||||
return `${protocol}//${host}/ws/asr/${videoId}${langParam}`;
|
||||
}, [videoId, language]);
|
||||
|
||||
const startStreaming = useCallback(() => {
|
||||
if (!videoElement) {
|
||||
console.error('No video element available');
|
||||
return;
|
||||
}
|
||||
try {
|
||||
setStatus('connecting');
|
||||
|
||||
const audioContext = new AudioContext({ sampleRate: 16000 });
|
||||
audioContextRef.current = audioContext;
|
||||
|
||||
const source = audioContext.createMediaElementSource(videoElement);
|
||||
sourceRef.current = source;
|
||||
|
||||
const processor = audioContext.createScriptProcessor(4096, 1, 1);
|
||||
processorRef.current = processor;
|
||||
|
||||
const ws = new WebSocket(getWSURL());
|
||||
wsRef.current = ws;
|
||||
|
||||
ws.onopen = () => {
|
||||
isStreamingRef.current = true;
|
||||
setIsStreaming(true);
|
||||
setStatus('streaming');
|
||||
};
|
||||
|
||||
ws.onmessage = (e) => {
|
||||
const msg: ASRMessage = JSON.parse(e.data);
|
||||
setTranscript(msg.full_text);
|
||||
setPartialTranscript(msg.is_final ? '' : msg.full_text);
|
||||
if (msg.is_final && msg.full_text.trim()) {
|
||||
onFinalTranscript?.(msg.full_text);
|
||||
}
|
||||
};
|
||||
|
||||
ws.onerror = () => setStatus('error');
|
||||
ws.onclose = () => {
|
||||
isStreamingRef.current = false;
|
||||
setIsStreaming(false);
|
||||
setStatus('disconnected');
|
||||
};
|
||||
|
||||
processor.onaudioprocess = (e) => {
|
||||
if (!isStreamingRef.current) return;
|
||||
if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
|
||||
const float32Data = e.inputBuffer.getChannelData(0);
|
||||
wsRef.current.send(float32Data.buffer);
|
||||
};
|
||||
|
||||
source.connect(processor);
|
||||
processor.connect(audioContext.destination);
|
||||
|
||||
} catch (err) {
|
||||
console.error('Video audio capture failed:', err);
|
||||
setStatus('error');
|
||||
}
|
||||
}, [videoElement, getWSURL, onFinalTranscript]);
|
||||
|
||||
const stopStreaming = useCallback(() => {
|
||||
isStreamingRef.current = false;
|
||||
setIsStreaming(false);
|
||||
processorRef.current?.disconnect();
|
||||
processorRef.current = null;
|
||||
sourceRef.current?.disconnect();
|
||||
sourceRef.current = null;
|
||||
wsRef.current?.close();
|
||||
wsRef.current = null;
|
||||
audioContextRef.current?.close();
|
||||
audioContextRef.current = null;
|
||||
setStatus('idle');
|
||||
setPartialTranscript('');
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
isStreamingRef.current = false;
|
||||
processorRef.current?.disconnect();
|
||||
sourceRef.current?.disconnect();
|
||||
wsRef.current?.close();
|
||||
audioContextRef.current?.close();
|
||||
};
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
if (!videoElement) return;
|
||||
const onPlay = () => startStreaming();
|
||||
const onPause = () => stopStreaming();
|
||||
const onEnded = () => stopStreaming();
|
||||
videoElement.addEventListener('play', onPlay);
|
||||
videoElement.addEventListener('pause', onPause);
|
||||
videoElement.addEventListener('ended', onEnded);
|
||||
return () => {
|
||||
videoElement.removeEventListener('play', onPlay);
|
||||
videoElement.removeEventListener('pause', onPause);
|
||||
videoElement.removeEventListener('ended', onEnded);
|
||||
};
|
||||
}, [videoElement, startStreaming, stopStreaming]);
|
||||
|
||||
return {
|
||||
transcript,
|
||||
partialTranscript,
|
||||
status,
|
||||
isStreaming,
|
||||
startStreaming,
|
||||
stopStreaming,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
// ─── Hook: useFullTranscript (Batch Mode) ───────────────────────────────────
|
||||
|
||||
interface UseFullTranscriptOptions {
|
||||
videoId: string;
|
||||
}
|
||||
|
||||
export function useFullTranscript({ videoId }: UseFullTranscriptOptions) {
|
||||
const [fullTranscript, setFullTranscript] = useState('');
|
||||
const [isLoading, setIsLoading] = useState(false);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
|
||||
const requestFullTranscript = useCallback(async () => {
|
||||
setIsLoading(true);
|
||||
setError(null);
|
||||
try {
|
||||
const resp = await fetch(`/api/v1/video/${videoId}/transcribe`, {
|
||||
method: 'POST',
|
||||
});
|
||||
if (!resp.ok) {
|
||||
throw new Error(`Server returned ${resp.status}`);
|
||||
}
|
||||
const data = await resp.json();
|
||||
setFullTranscript(data.text);
|
||||
return data.text;
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : 'Transcription failed';
|
||||
setError(msg);
|
||||
return null;
|
||||
} finally {
|
||||
setIsLoading(false);
|
||||
}
|
||||
}, [videoId]);
|
||||
|
||||
return { fullTranscript, isLoading, error, requestFullTranscript };
|
||||
}
|
||||
|
||||
|
||||
// ─── Usage Example (in LTTPage.tsx) ─────────────────────────────────────────
|
||||
//
|
||||
// const videoRef = useRef<HTMLVideoElement>(null);
|
||||
// const [currentVideoId, setCurrentVideoId] = useState<string | null>(null);
|
||||
//
|
||||
// // Streaming ASR (auto-starts on video play, stops on pause/end)
|
||||
// const asr = useVideoASR({
|
||||
// videoId: currentVideoId ?? '',
|
||||
// videoElement: videoRef.current,
|
||||
// language: 'yue',
|
||||
// onFinalTranscript: (text) => {
|
||||
// setQueryText(text); // into QueryInput
|
||||
// },
|
||||
// });
|
||||
//
|
||||
// // Full Transcript (manual button)
|
||||
// const ft = useFullTranscript({ videoId: currentVideoId ?? '' });
|
||||
//
|
||||
// return (
|
||||
// <>
|
||||
// <video ref={videoRef} src={videoUrl} controls />
|
||||
// <button onClick={ft.requestFullTranscript} disabled={ft.isLoading}>
|
||||
// {ft.isLoading ? 'Transcribing...' : 'Full Transcript'}
|
||||
// </button>
|
||||
// <QueryInput
|
||||
// value={queryText}
|
||||
// onChange={setQueryText}
|
||||
// onSubmit={handleQuerySubmit}
|
||||
// partialText={asr.partialTranscript}
|
||||
// />
|
||||
// </>
|
||||
// );
|
||||
|
|
@ -0,0 +1,159 @@
|
|||
<!--
|
||||
Reference: Browser Audio Capture + WebSocket Streaming for Alibaba Cloud ASR.
|
||||
|
||||
Extracted from: /mnt/c/Users/woody/Documents/projects/voice input/backend/static/index.html
|
||||
|
||||
Architecture:
|
||||
Browser mic → AudioContext (16kHz mono) → ScriptProcessor → Float32Array
|
||||
→ WebSocket.send(float32Data.buffer) → FastAPI → DashScope
|
||||
|
||||
Key points:
|
||||
- Must use HTTPS/WSS (Chrome blocks getUserMedia on HTTP)
|
||||
- AudioContext sampleRate MUST be 16000
|
||||
- ScriptProcessor buffer size: 4096 (lower = more frequent sends, lower latency)
|
||||
- Send raw Float32 bytes (NOT base64, NOT WAV) to our backend WebSocket
|
||||
- Backend handles Float32 → S16_LE → base64 conversion
|
||||
- Language query param: ?language=yue (Cantonese), zh, en, auto
|
||||
-->
|
||||
<!DOCTYPE html>
|
||||
<html lang="zh-HK">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>ASR Reference - Audio Capture</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Alibaba Cloud ASR - Audio Capture Pattern</h1>
|
||||
|
||||
<!-- Language selector -->
|
||||
<select id="langSelect">
|
||||
<option value="auto">Auto-Detect</option>
|
||||
<option value="en">English</option>
|
||||
<option value="zh">Mandarin</option>
|
||||
<option value="yue" selected>Cantonese</option>
|
||||
</select>
|
||||
|
||||
<!-- Record toggle button -->
|
||||
<button id="recordBtn">Start Recording</button>
|
||||
|
||||
<!-- Status indicator -->
|
||||
<div id="status">Ready</div>
|
||||
|
||||
<!-- Transcript display -->
|
||||
<div id="transcript"></div>
|
||||
|
||||
<script>
|
||||
// ─── Configuration ──────────────────────────────────────────────────────────
|
||||
const WS_PATH = '/ws/asr/session-1'; // video_id from URL or session
|
||||
const WS_BASE = `${location.protocol === 'https:' ? 'wss:' : 'ws:'}//${location.host}${WS_PATH}`;
|
||||
|
||||
// ─── State ──────────────────────────────────────────────────────────────────
|
||||
let ws = null;
|
||||
let audioContext = null;
|
||||
let processor = null; // ScriptProcessorNode
|
||||
let stream = null; // MediaStream
|
||||
let isRecording = false;
|
||||
|
||||
// ─── DOM Refs ───────────────────────────────────────────────────────────────
|
||||
const recordBtn = document.getElementById('recordBtn');
|
||||
const langSelect = document.getElementById('langSelect');
|
||||
const statusEl = document.getElementById('status');
|
||||
const transcriptEl = document.getElementById('transcript');
|
||||
|
||||
// ─── WebSocket URL Builder ──────────────────────────────────────────────────
|
||||
function getWSURL() {
|
||||
const lang = langSelect.value;
|
||||
return lang !== 'auto' ? `${WS_BASE}?language=${lang}` : WS_BASE;
|
||||
}
|
||||
|
||||
// ─── Status Helper ──────────────────────────────────────────────────────────
|
||||
function setStatus(text) {
|
||||
statusEl.textContent = text;
|
||||
}
|
||||
|
||||
// ─── Audio Capture Setup ────────────────────────────────────────────────────
|
||||
async function startRecording() {
|
||||
try {
|
||||
// Step 1: Get microphone access
|
||||
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
|
||||
// Step 2: Create AudioContext at EXACTLY 16kHz (required by DashScope)
|
||||
audioContext = new AudioContext({ sampleRate: 16000 });
|
||||
|
||||
// Step 3: Create media stream source
|
||||
const source = audioContext.createMediaStreamSource(stream);
|
||||
|
||||
// Step 4: Create ScriptProcessor for raw PCM access
|
||||
// Buffer size 4096 → ~256ms chunks at 16kHz (lower = more responsive)
|
||||
processor = audioContext.createScriptProcessor(4096, 1, 1);
|
||||
|
||||
// Step 5: Connect WebSocket
|
||||
ws = new WebSocket(getWSURL());
|
||||
|
||||
ws.onopen = () => {
|
||||
isRecording = true;
|
||||
recordBtn.textContent = 'Stop Recording';
|
||||
setStatus('Listening...');
|
||||
};
|
||||
|
||||
ws.onmessage = (e) => {
|
||||
const { full_text, language, is_final } = JSON.parse(e.data);
|
||||
// Display transcript — in our React app, this goes into QueryInput
|
||||
transcriptEl.textContent = full_text || '';
|
||||
};
|
||||
|
||||
ws.onerror = () => setStatus('WebSocket error');
|
||||
ws.onclose = () => {
|
||||
isRecording = false;
|
||||
recordBtn.textContent = 'Start Recording';
|
||||
setStatus('Disconnected');
|
||||
};
|
||||
|
||||
// Step 6: Audio processing callback — fires every ~256ms
|
||||
processor.onaudioprocess = (e) => {
|
||||
// Get raw Float32 samples from input channel 0
|
||||
const float32Data = e.inputBuffer.getChannelData(0);
|
||||
|
||||
// Send raw Float32 bytes to backend (NOT base64, NOT WAV)
|
||||
// The .buffer property gives us an ArrayBuffer of the Float32Array
|
||||
if (ws && ws.readyState === WebSocket.OPEN && isRecording) {
|
||||
ws.send(float32Data.buffer);
|
||||
}
|
||||
};
|
||||
|
||||
// Step 7: Connect the audio graph
|
||||
source.connect(processor);
|
||||
processor.connect(audioContext.destination); // Required: prevents auto-mute
|
||||
|
||||
} catch (err) {
|
||||
setStatus('Microphone access denied');
|
||||
console.error('Audio capture error:', err);
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Stop Recording ─────────────────────────────────────────────────────────
|
||||
function stopRecording() {
|
||||
isRecording = false;
|
||||
|
||||
// Clean up audio graph
|
||||
processor?.disconnect();
|
||||
stream?.getTracks().forEach(t => t.stop());
|
||||
|
||||
// Close WebSocket
|
||||
ws?.close();
|
||||
|
||||
recordBtn.textContent = 'Start Recording';
|
||||
setStatus('Ready');
|
||||
}
|
||||
|
||||
// ─── Toggle Recording ───────────────────────────────────────────────────────
|
||||
recordBtn.addEventListener('click', () => {
|
||||
if (isRecording) {
|
||||
stopRecording();
|
||||
} else {
|
||||
startRecording();
|
||||
}
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -0,0 +1,321 @@
|
|||
# Phase 2: Video Upload + Video Audio ASR → RAG — Implementation Plan
|
||||
|
||||
**Created:** 2026-05-06
|
||||
**Updated:** 2026-05-06 (video audio capture via createMediaElementSource; Full Transcript batch mode)
|
||||
**Status:** Planning — Not Started
|
||||
**Depends on:** Phase 1 (Complete)
|
||||
|
||||
---
|
||||
|
||||
## 1. Overview
|
||||
|
||||
Phase 2 adds video upload/playback and ASR transcription of the **video's audio track** (not microphone). When the video plays, browser captures the video audio output and streams it to Alibaba Cloud DashScope for real-time transcription. A "Full Transcript" button sends the complete video audio for batch (non-streaming) transcription via backend ffmpeg extraction.
|
||||
|
||||
### Two ASR Modes
|
||||
|
||||
**Mode A — Streaming (real-time, auto on play):**
|
||||
```
|
||||
<video> → AudioContext.createMediaElementSource(video)
|
||||
→ ScriptProcessor(4096, 1, 1) → Float32 PCM
|
||||
→ WebSocket → FastAPI → DashScope realtime API
|
||||
→ transcript JSON → QueryInput (in real time)
|
||||
```
|
||||
Auto-starts when video plays, stops on pause/seek/end. Partial transcript flows into QueryInput.
|
||||
|
||||
**Mode B — Full Transcript (batch, manual button):**
|
||||
```
|
||||
User clicks "Full Transcript" under video player
|
||||
→ POST /api/v1/video/{id}/transcribe
|
||||
→ Backend: ffmpeg extracts audio from uploaded video
|
||||
→ DashScope OpenAI-compatible API (non-streaming)
|
||||
→ Complete transcript of entire video → QueryInput
|
||||
```
|
||||
Server-side audio extraction via ffmpeg. No browser involvement.
|
||||
|
||||
### Changes From Previous Versions
|
||||
|
||||
| Aspect | V2 (mic capture) | V3 (video audio capture) |
|
||||
|---|---|---|
|
||||
| Audio source | getUserMedia() microphone | createMediaElementSource(video) |
|
||||
| Trigger | Manual record button | Auto on video play, stop on pause/end |
|
||||
| Permissions | Microphone permission required | None |
|
||||
| Batch mode | No | Yes — "Full Transcript" button |
|
||||
| Backend ffmpeg | Not needed | For Full Transcript mode |
|
||||
|
||||
---
|
||||
|
||||
## 2. User Flow
|
||||
|
||||
1. User uploads video → appears in left panel player
|
||||
2. User presses play → browser captures video audio → streams to DashScope → transcript in QueryInput
|
||||
3. User pauses/seeks/ends → streaming stops, accumulated transcript stays in QueryInput
|
||||
4. User edits transcript in QueryInput and clicks Submit → Phase 1 RAG pipeline
|
||||
5. **Full Transcript**: clicks "Full Transcript" button → server extracts audio → batch ASR → complete transcript fills QueryInput
|
||||
|
||||
---
|
||||
|
||||
## 3. Sub-Phases
|
||||
|
||||
### Phase 2.1 — Configuration & Infrastructure Setup (0.5 day)
|
||||
|
||||
Config fields, directory structure, service/router/model skeletons, register routers.
|
||||
|
||||
**Test:** `test_phase2_config.py`
|
||||
|
||||
**Tasks:**
|
||||
| # | Task | File |
|
||||
|---|------|------|
|
||||
| 2.1.1 | Add 6 config fields: `dashscope_api_key`, `asr_model_name`, `asr_realtime_model_name`, `video_upload_dir`, `max_video_size_mb`, `supported_video_formats` | `core/config.py` |
|
||||
| 2.1.2 | Update `.env.example` | `.env.example` |
|
||||
| 2.1.3 | Add deps: `dashscope>=0.4.0`, `openai>=1.52.0`, `zhconv>=1.4.0`, `python-multipart`, `aiofiles` | `requirements.txt` |
|
||||
| 2.1.4 | Create `models/video.py` — `VideoUploadResponse`, `FullTranscriptResponse` | `models/video.py` |
|
||||
| 2.1.5 | Create `models/asr.py` — `ASRTranscriptEvent` | `models/asr.py` |
|
||||
| 2.1.6 | Create `services/video_service.py`, `services/asr_client.py` stubs | `services/` |
|
||||
| 2.1.7 | Create `routers/video.py` stub: `POST /upload`, `GET /{id}`, `POST /{id}/transcribe` | `routers/video.py` |
|
||||
| 2.1.8 | Create `routers/ws_asr.py` stub: `WS /ws/asr/{video_id}?language=yue` | `routers/ws_asr.py` |
|
||||
| 2.1.9 | Register routers in `main.py` | `main.py` |
|
||||
| 2.1.10 | Write and pass `test_phase2_config.py` | `app/test/` |
|
||||
|
||||
---
|
||||
|
||||
### Phase 2.2 — Video Upload Backend (0.5 day)
|
||||
|
||||
Streaming upload with size/format validation. Reuses `routers/ingest.py` pattern.
|
||||
|
||||
**Test:** `test_phase2_video_upload.py` (implement 4 existing stubs)
|
||||
|
||||
**Tasks:**
|
||||
| # | Task | File |
|
||||
|---|------|------|
|
||||
| 2.2.1 | Write tests — implement all 4 stubs | `test_phase2_video_upload.py` |
|
||||
| 2.2.2 | Implement `VideoService.validate_video()`, `save_video()` (streaming, aiofiles) | `services/video_service.py` |
|
||||
| 2.2.3 | Implement `VideoService.get_video_path()`, `delete_video()` | `services/video_service.py` |
|
||||
| 2.2.4 | Implement `POST /api/v1/video/upload` route | `routers/video.py` |
|
||||
| 2.2.5 | Implement `GET /api/v1/video/{video_id}` route (FileResponse) | `routers/video.py` |
|
||||
| 2.2.6 | Run tests → pass → commit | — |
|
||||
|
||||
---
|
||||
|
||||
### Phase 2.3 — ASR WebSocket Proxy + Full Transcript Backend (1 day)
|
||||
|
||||
Two backend ASR paths: real-time streaming (WebSocket proxy to DashScope) and batch (ffmpeg extract → DashScope non-streaming API).
|
||||
|
||||
**Reference:** `.examples/alibaba_asr_backend.py`
|
||||
|
||||
**Tests:** `test_phase2_asr_client.py` (3 stubs), `test_phase2_ws_asr.py` (3 stubs), `test_phase2_ws_protocol.py` (new), `test_phase2_full_transcript.py` (new)
|
||||
|
||||
**Acceptance Criteria:**
|
||||
- WebSocket `/ws/asr/{video_id}?language=yue` → Float32 PCM → S16_LE base64 → DashScope realtime
|
||||
- `transcription.text` events → `{"full_text": "...", "is_final": false}` to browser
|
||||
- `transcription.completed` events → `{"full_text": "...", "is_final": true}` to browser
|
||||
- Language: `yue` (Cantonese), `zh`, `en`, `auto`
|
||||
- Traditional Chinese via `zhconv`
|
||||
- `POST /api/v1/video/{video_id}/transcribe` → ffmpeg extract audio → DashScope batch → `{"text": "..."}`
|
||||
- `DASHSCOPE_API_KEY` not set → clear error
|
||||
- Client disconnect → DashScope session closed cleanly
|
||||
|
||||
**Tasks:**
|
||||
| # | Task | File |
|
||||
|---|------|------|
|
||||
| 2.3.1 | Write tests first | `app/test/` |
|
||||
| 2.3.2 | `float32_to_s16le()`, `build_display_text()`, `_to_traditional()` | `services/asr_client.py` |
|
||||
| 2.3.3 | `DashScopeCallback` (sync SDK → asyncio.Queue bridge) + `_ws_proxy_dashscope()` | `routers/ws_asr.py` |
|
||||
| 2.3.4 | WebSocket endpoint | `routers/ws_asr.py` |
|
||||
| 2.3.5 | `VideoService.extract_audio()` — ffmpeg async subprocess: PCM16LE 16kHz mono | `services/video_service.py` |
|
||||
| 2.3.6 | `ASRClient.transcribe_full()` — batch: WAV → DashScope OpenAI-compatible API | `services/asr_client.py` |
|
||||
| 2.3.7 | `POST /api/v1/video/{video_id}/transcribe` route | `routers/video.py` |
|
||||
| 2.3.8 | Enhance `conftest.py` mock_asr_client | `conftest.py` |
|
||||
| 2.3.9 | Run tests → pass → commit | — |
|
||||
|
||||
---
|
||||
|
||||
### Phase 2.4 — Transcript → QueryInput + Full Transcript Button (0.5 day)
|
||||
|
||||
Wire up real-time transcript from streaming ASR into QueryInput. Full Transcript button wiring.
|
||||
|
||||
**Tests:** `test_phase2_useVideoASR.test.ts`, `test_phase2_useFullTranscript.test.ts`, `test_phase2_QueryInput_integration.test.tsx`
|
||||
|
||||
**Acceptance Criteria:**
|
||||
- `useVideoASR` hook: auto-starts on video `play`, stops on `pause`/`ended`
|
||||
- `useVideoASR` exposes `transcript`, `partialTranscript`, `isStreaming`, `status`
|
||||
- `useFullTranscript` hook: `requestFullTranscript()` → loading → transcript → error
|
||||
- QueryInput shows transcript (grey italic = partial, black = final)
|
||||
- QueryInput accepts `partialText` prop
|
||||
|
||||
**Tasks:**
|
||||
| # | Task | File |
|
||||
|---|------|------|
|
||||
| 2.4.1 | Write tests first | `src/test/` |
|
||||
| 2.4.2 | Create `hooks/useVideoASR.ts` (see `.examples/alibaba_asr_frontend_react.tsx`) | `hooks/useVideoASR.ts` |
|
||||
| 2.4.3 | Create `hooks/useFullTranscript.ts` | `hooks/useFullTranscript.ts` |
|
||||
| 2.4.4 | Update `types/index.ts` — `ASRMessage`, `ASRStatus`, `FullTranscriptResponse` | `types/index.ts` |
|
||||
| 2.4.5 | Update `QueryInput.tsx` — add `partialText` prop | `components/QueryInput.tsx` |
|
||||
| 2.4.6 | Run tests → pass → commit | — |
|
||||
|
||||
---
|
||||
|
||||
### Phase 2.5 — Frontend: Video Player + Buttons + Layout (1.5 days)
|
||||
|
||||
Replace `VideoPlaceholder` with video upload + player. ASR auto on play. Full Transcript button.
|
||||
|
||||
```
|
||||
┌─────────────────────┬──────────────────────────┐
|
||||
│ VideoUpload / │ QueryInput │ ← Upper Panel (30%)
|
||||
│ VideoPlayer │ (transcript flows here │
|
||||
│ │ from video audio ASR) │
|
||||
│ [Full Transcript] │ [Submit] │
|
||||
├─────────────────────┴──────────────────────────┤
|
||||
│ ResponsePanel │ ← Lower Panel (70%)
|
||||
└────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Tests:** `test_phase2_VideoUpload.test.tsx`, `test_phase2_VideoPlayer.test.tsx`, `test_phase2_LTTPage_integration.test.tsx`
|
||||
|
||||
**Acceptance Criteria:**
|
||||
- Drag-and-drop video upload with progress bar (native HTML5)
|
||||
- Video player with native `<video controls>` exposing ref
|
||||
- ASR auto on play → transcript in QueryInput; stops on pause/end
|
||||
- "Full Transcript" button → loading spinner → fills QueryInput with full transcript
|
||||
- Error states: upload fails, ASR fails, Full Transcript fails → clear messages
|
||||
|
||||
**Tasks:**
|
||||
| # | Task | File |
|
||||
|---|------|------|
|
||||
| 2.5.1 | Write all 3 tests first | `src/test/` |
|
||||
| 2.5.2 | Create `VideoUpload.tsx` — native drag-drop, axios progress | `components/VideoUpload.tsx` |
|
||||
| 2.5.3 | Create `VideoPlayer.tsx` — native `<video controls>`, forwardRef | `components/VideoPlayer.tsx` |
|
||||
| 2.5.4 | Update `types/index.ts` | `types/index.ts` |
|
||||
| 2.5.5 | Update `lib/api.ts` — `uploadVideo()`, `getVideoUrl()`, `requestFullTranscript()` | `lib/api.ts` |
|
||||
| 2.5.6 | Update `lib/queries.tsx` — `useVideoUpload()` | `lib/queries.tsx` |
|
||||
| 2.5.7 | Refactor `LTTPage.tsx` — replace VideoPlaceholder, wire hooks + QueryInput | `pages/LTTPage.tsx` |
|
||||
| 2.5.8 | Update `QueryInput.tsx` — transcript value + partial text styling | `components/QueryInput.tsx` |
|
||||
| 2.5.9 | Run tests → pass → commit | — |
|
||||
|
||||
---
|
||||
|
||||
### Phase 2.6 — Integration & Acceptance Testing (1 day)
|
||||
|
||||
**Tests:** `test_integration_phase2.py`, `test_acceptance_phase2_video.py`, `test_acceptance_phase2_asr.py`, `test_acceptance_integration_phase2.py`
|
||||
|
||||
**Tasks:**
|
||||
| # | Task |
|
||||
|---|------|
|
||||
| 2.6.1 | Implement integration test (mocked DashScope, real ChromaDB + file I/O) |
|
||||
| 2.6.2 | Implement acceptance: real video upload + Full Transcript |
|
||||
| 2.6.3 | Implement acceptance: real DashScope streaming + batch |
|
||||
| 2.6.4 | Implement E2E acceptance |
|
||||
| 2.6.5 | Full regression run |
|
||||
| 2.6.6 | Fix failures, final commit |
|
||||
|
||||
---
|
||||
|
||||
### Phase 2.7 — Polish & Deployment (0.5 day)
|
||||
|
||||
| # | Task |
|
||||
|---|------|
|
||||
| 2.7.1 | Structured logging for DashScope proxy + full transcript events |
|
||||
| 2.7.2 | Update `nginx.conf` — `client_max_body_size` 350M |
|
||||
| 2.7.3 | Verify production build |
|
||||
| 2.7.4 | Update `README.md` |
|
||||
| 2.7.5 | Final commit |
|
||||
|
||||
---
|
||||
|
||||
## 4. Timeline
|
||||
|
||||
| Sub-Phase | Description | Effort | Depends On |
|
||||
|---|---|---|---|
|
||||
| 2.1 | Config & Infrastructure | 0.5 day | — |
|
||||
| 2.2 | Video Upload Backend | 0.5 day | 2.1 |
|
||||
| 2.3 | ASR Proxy + Full Transcript | 1 day | 2.1 |
|
||||
| 2.4 | Transcript → QueryInput | 0.5 day | 2.3 |
|
||||
| 2.5 | Frontend: Layout + Buttons | 1.5 days | 2.2, 2.3 |
|
||||
| 2.6 | Integration & Acceptance | 1 day | 2.4, 2.5 |
|
||||
| 2.7 | Polish & Deployment | 0.5 day | 2.6 |
|
||||
| **Total** | | **5.5 days** | |
|
||||
|
||||
2.2 (upload) and 2.3 (ASR) run concurrently.
|
||||
|
||||
---
|
||||
|
||||
## 5. Dependencies
|
||||
|
||||
**Backend:** `dashscope>=0.4.0`, `openai>=1.52.0`, `zhconv>=1.4.0`, `python-multipart`, `aiofiles`
|
||||
**Frontend:** None (native Web APIs: `AudioContext.createMediaElementSource`, `ScriptProcessorNode`, `<video>`, HTML5 drag-and-drop)
|
||||
**System:** ffmpeg on server (for Full Transcript audio extraction)
|
||||
|
||||
---
|
||||
|
||||
## 6. Config Fields
|
||||
|
||||
```python
|
||||
dashscope_api_key: str = ""
|
||||
asr_model_name: str = "qwen3-asr-flash" # Batch API
|
||||
asr_realtime_model_name: str = "qwen3-asr-flash-realtime" # Streaming
|
||||
video_upload_dir: str = "./uploads"
|
||||
max_video_size_mb: int = 300
|
||||
supported_video_formats: list[str] = [".mp4", ".webm", ".mov", ".avi", ".mkv"]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Key Design Decisions
|
||||
|
||||
| Decision | Choice | Why |
|
||||
|---|---|---|
|
||||
| Audio source | `createMediaElementSource(video)` | Captures video audio during playback. No mic permission. |
|
||||
| ASR auto trigger | Video `play` event | Transcript appears as user watches. Natural UX. |
|
||||
| ASR stop trigger | Video `pause`/`ended` events | Clean lifecycle. New session on next play/seek. |
|
||||
| Full Transcript | Manual button + server ffmpeg | User explicitly requests. Server has the file. |
|
||||
| Full Transcript ASR | DashScope OpenAI-compatible API | Standard `/v1/chat/completions` with `input_audio`. WAV format. |
|
||||
| ASR streaming | DashScope realtime SDK | `OmniRealtimeConversation` + callback → asyncio.Queue bridge |
|
||||
| Transcript display | QueryInput textarea | Editable. Same box for typing or ASR output. |
|
||||
| SSL | Required | Chrome may block `createMediaElementSource` without secure context. |
|
||||
|
||||
---
|
||||
|
||||
## 8. File Manifest
|
||||
|
||||
### New Files
|
||||
```
|
||||
backend/
|
||||
app/routers/video.py
|
||||
app/routers/ws_asr.py
|
||||
app/services/video_service.py
|
||||
app/services/asr_client.py
|
||||
app/models/video.py
|
||||
app/models/asr.py
|
||||
app/test/test_phase2_config.py
|
||||
app/test/test_phase2_ws_protocol.py
|
||||
app/test/test_phase2_full_transcript.py
|
||||
app/test/test_phase2_transcript_to_rag.py
|
||||
|
||||
frontend/src/
|
||||
components/VideoUpload.tsx
|
||||
components/VideoPlayer.tsx
|
||||
hooks/useVideoASR.ts
|
||||
hooks/useFullTranscript.ts
|
||||
test/test_phase2_VideoUpload.test.tsx
|
||||
test/test_phase2_VideoPlayer.test.tsx
|
||||
test/test_phase2_useVideoASR.test.ts
|
||||
test/test_phase2_useFullTranscript.test.ts
|
||||
test/test_phase2_QueryInput_integration.test.tsx
|
||||
test/test_phase2_LTTPage_integration.test.tsx
|
||||
```
|
||||
|
||||
### Modified Files
|
||||
```
|
||||
backend/app/core/config.py, main.py, test/conftest.py, .env.example, requirements.txt
|
||||
frontend/src/pages/LTTPage.tsx, components/QueryInput.tsx, lib/api.ts, lib/queries.tsx, types/index.ts
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 9. Reference Code (`.examples/`)
|
||||
|
||||
| File | Content |
|
||||
|---|---|
|
||||
| `alibaba_asr_backend.py` | DashScope WebSocket proxy + non-streaming fallback (FastAPI) |
|
||||
| `alibaba_asr_frontend_vanilla.html` | Browser audio capture (vanilla JS, original) |
|
||||
| `alibaba_asr_frontend_react.tsx` | React/TS: `useVideoASR` (streaming) + `useFullTranscript` (batch) hooks |
|
||||
| `README.md` | Architecture overview + dependency notes |
|
||||
|
|
@ -26,3 +26,13 @@ PROMPTS_DB_PATH=./data/prompts.db
|
|||
HISTORY_DB_PATH=./data/history.db
|
||||
|
||||
CORS_ORIGINS=["http://localhost:5173","http://localhost:3000"]
|
||||
|
||||
# Alibaba Cloud DashScope ASR (Phase 2)
|
||||
# Get your key from: https://modelstudio.console.alibabacloud.com
|
||||
DASHSCOPE_API_KEY=sk-your-dashscope-key-here
|
||||
ASR_MODEL_NAME=qwen3-asr-flash
|
||||
ASR_REALTIME_MODEL_NAME=qwen3-asr-flash-realtime
|
||||
|
||||
# Video upload (Phase 2)
|
||||
VIDEO_UPLOAD_DIR=./uploads
|
||||
MAX_VIDEO_SIZE_MB=300
|
||||
|
|
|
|||
|
|
@ -44,6 +44,16 @@ class Settings(BaseSettings):
|
|||
relevance_threshold: float = 7.0
|
||||
llm_timeout: float = 60.0
|
||||
|
||||
# Alibaba Cloud DashScope ASR (Phase 2)
|
||||
dashscope_api_key: str = ""
|
||||
asr_model_name: str = "qwen3-asr-flash"
|
||||
asr_realtime_model_name: str = "qwen3-asr-flash-realtime"
|
||||
|
||||
# Video upload (Phase 2)
|
||||
video_upload_dir: str = "./uploads"
|
||||
max_video_size_mb: int = 300
|
||||
supported_video_formats: list[str] = [".mp4", ".webm", ".mov", ".avi", ".mkv"]
|
||||
|
||||
# Development helpers
|
||||
model_config = {"env_file": ".env", "env_file_encoding": "utf-8"}
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ from fastapi import FastAPI
|
|||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import FileResponse
|
||||
|
||||
from app.routers import ingest, query, documents, prompts, history, chunks
|
||||
from app.routers import ingest, query, documents, prompts, history, chunks, video, ws_asr
|
||||
from app.core.config import get_settings
|
||||
from app.core.sqlite_db import (
|
||||
get_prompts_db,
|
||||
|
|
@ -56,6 +56,8 @@ app.include_router(documents.router, prefix="/api/v1")
|
|||
app.include_router(prompts.router)
|
||||
app.include_router(history.router)
|
||||
app.include_router(chunks.router)
|
||||
app.include_router(video.router, prefix="/api/v1")
|
||||
app.include_router(ws_asr.router)
|
||||
|
||||
_prompts_conn = get_prompts_db()
|
||||
init_prompts_db(_prompts_conn)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,8 @@
|
|||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class ASRTranscriptEvent(BaseModel):
|
||||
delta: str = ""
|
||||
full_text: str
|
||||
language: str
|
||||
is_final: bool
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class VideoUploadResponse(BaseModel):
|
||||
video_id: str
|
||||
filename: str
|
||||
size_bytes: int
|
||||
url: str
|
||||
|
||||
|
||||
class VideoInfo(BaseModel):
|
||||
video_id: str
|
||||
filename: str
|
||||
size_bytes: int
|
||||
upload_date: str
|
||||
|
||||
|
||||
class FullTranscriptResponse(BaseModel):
|
||||
text: str
|
||||
language: str
|
||||
duration_seconds: float | None = None
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
import logging
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
import aiofiles
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException
|
||||
from fastapi.responses import FileResponse
|
||||
|
||||
from app.models.video import VideoUploadResponse
|
||||
from app.services.video_service import VideoService
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(tags=["video"])
|
||||
|
||||
|
||||
def _get_video_service() -> VideoService:
|
||||
from app.core.config import get_settings
|
||||
|
||||
s = get_settings()
|
||||
return VideoService(
|
||||
upload_dir=s.video_upload_dir,
|
||||
max_size_mb=s.max_video_size_mb,
|
||||
supported_formats=s.supported_video_formats,
|
||||
)
|
||||
|
||||
|
||||
@router.post("/video/upload", response_model=VideoUploadResponse)
|
||||
async def upload_video(file: UploadFile = File(...)):
|
||||
service = _get_video_service()
|
||||
filename = file.filename or "unknown"
|
||||
ext = Path(filename).suffix.lower()
|
||||
|
||||
total_size = 0
|
||||
video_id = uuid.uuid4().hex[:12]
|
||||
dest_path = service.upload_dir / f"{video_id}{ext}"
|
||||
|
||||
try:
|
||||
async with aiofiles.open(dest_path, "wb") as out:
|
||||
while chunk := await file.read(1024 * 1024):
|
||||
total_size += len(chunk)
|
||||
if total_size > service.max_size_bytes:
|
||||
raise HTTPException(
|
||||
status_code=413,
|
||||
detail=f"File exceeds {service.max_size_bytes // 1024 // 1024}MB limit",
|
||||
)
|
||||
await out.write(chunk)
|
||||
except HTTPException:
|
||||
dest_path.unlink(missing_ok=True)
|
||||
raise
|
||||
except Exception:
|
||||
dest_path.unlink(missing_ok=True)
|
||||
raise HTTPException(status_code=500, detail="Upload failed")
|
||||
|
||||
service.validate_video(filename, file.content_type, total_size)
|
||||
logger.info("Video uploaded: id=%s filename=%s size=%d", video_id, filename, total_size)
|
||||
|
||||
return VideoUploadResponse(
|
||||
video_id=video_id,
|
||||
filename=filename,
|
||||
size_bytes=total_size,
|
||||
url=f"/api/v1/video/{video_id}",
|
||||
)
|
||||
|
||||
|
||||
@router.get("/video/{video_id}")
|
||||
async def serve_video(video_id: str):
|
||||
service = _get_video_service()
|
||||
video_path = service.get_video_path(video_id)
|
||||
ext = video_path.suffix.lower()
|
||||
media_types = {
|
||||
".mp4": "video/mp4",
|
||||
".webm": "video/webm",
|
||||
".mov": "video/quicktime",
|
||||
".avi": "video/x-msvideo",
|
||||
".mkv": "video/x-matroska",
|
||||
}
|
||||
return FileResponse(str(video_path), media_type=media_types.get(ext, "application/octet-stream"))
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
import logging
|
||||
|
||||
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
router = APIRouter(tags=["asr"])
|
||||
|
||||
|
||||
@router.websocket("/ws/asr/{video_id}")
|
||||
async def ws_asr_endpoint(websocket: WebSocket, video_id: str, language: str = "yue"):
|
||||
await websocket.accept()
|
||||
try:
|
||||
while True:
|
||||
await websocket.receive_bytes()
|
||||
except WebSocketDisconnect:
|
||||
pass
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
import struct
|
||||
import base64
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def float32_to_s16le(float32_bytes: bytes) -> bytes:
|
||||
num_samples = len(float32_bytes) // 4
|
||||
floats = struct.unpack(f"<{num_samples}f", float32_bytes)
|
||||
int16_samples = [max(-32768, min(32767, int(f * 32767.0))) for f in floats]
|
||||
return struct.pack(f"<{num_samples}h", *int16_samples)
|
||||
|
||||
|
||||
def build_display_text(accumulated: str, current: str) -> str:
|
||||
parts = [p for p in (accumulated, current) if p and p.strip()]
|
||||
return " ".join(parts)
|
||||
|
||||
|
||||
class ASRClient:
|
||||
def __init__(self, settings):
|
||||
self.settings = settings
|
||||
|
||||
async def transcribe_full(self, audio_bytes: bytes, language: str = "yue") -> str:
|
||||
raise NotImplementedError
|
||||
|
|
@ -0,0 +1,35 @@
|
|||
from pathlib import Path
|
||||
from fastapi import HTTPException
|
||||
|
||||
|
||||
class VideoService:
|
||||
def __init__(self, upload_dir: str, max_size_mb: int, supported_formats: list[str]):
|
||||
self.upload_dir = Path(upload_dir)
|
||||
self.max_size_bytes = max_size_mb * 1024 * 1024
|
||||
self.supported_formats = supported_formats
|
||||
self.upload_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def validate_video(self, filename: str | None, content_type: str | None, size_bytes: int) -> None:
|
||||
if not filename:
|
||||
raise HTTPException(status_code=400, detail="No file selected")
|
||||
ext = Path(filename).suffix.lower()
|
||||
if ext not in self.supported_formats:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported format: {ext}. Supported: {', '.join(self.supported_formats)}",
|
||||
)
|
||||
if size_bytes > self.max_size_bytes:
|
||||
raise HTTPException(
|
||||
status_code=413,
|
||||
detail=f"File exceeds {self.max_size_bytes // 1024 // 1024}MB limit",
|
||||
)
|
||||
|
||||
def get_video_path(self, video_id: str) -> Path:
|
||||
candidates = list(self.upload_dir.glob(f"{video_id}.*"))
|
||||
if not candidates:
|
||||
raise HTTPException(status_code=404, detail="Video not found")
|
||||
return candidates[0]
|
||||
|
||||
def delete_video(self, video_id: str) -> None:
|
||||
for p in self.upload_dir.glob(f"{video_id}.*"):
|
||||
p.unlink()
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
"""Phase 2 config tests: ASR and video settings."""
|
||||
import pytest
|
||||
|
||||
|
||||
def test_config_asr_defaults(monkeypatch, tmp_path):
|
||||
monkeypatch.delenv("DASHSCOPE_API_KEY", raising=False)
|
||||
monkeypatch.delenv("ASR_MODEL_NAME", raising=False)
|
||||
monkeypatch.delenv("ASR_REALTIME_MODEL_NAME", raising=False)
|
||||
monkeypatch.setenv("LLM_API_KEY", "sk-test")
|
||||
monkeypatch.setenv("DP_API_KEY", "sk-test")
|
||||
monkeypatch.setenv("EMBEDDING_API_KEY", "sk-test")
|
||||
env_file = tmp_path / ".env"
|
||||
env_file.write_text("")
|
||||
monkeypatch.chdir(tmp_path)
|
||||
|
||||
from app.core.config import Settings, get_settings
|
||||
get_settings.cache_clear()
|
||||
settings = Settings(_env_file=())
|
||||
assert settings.dashscope_api_key == ""
|
||||
assert settings.asr_model_name == "qwen3-asr-flash"
|
||||
assert settings.asr_realtime_model_name == "qwen3-asr-flash-realtime"
|
||||
|
||||
|
||||
def test_config_video_defaults(monkeypatch, tmp_path):
|
||||
monkeypatch.delenv("VIDEO_UPLOAD_DIR", raising=False)
|
||||
monkeypatch.delenv("MAX_VIDEO_SIZE_MB", raising=False)
|
||||
monkeypatch.setenv("LLM_API_KEY", "sk-test")
|
||||
monkeypatch.setenv("DP_API_KEY", "sk-test")
|
||||
monkeypatch.setenv("EMBEDDING_API_KEY", "sk-test")
|
||||
env_file = tmp_path / ".env"
|
||||
env_file.write_text("")
|
||||
monkeypatch.chdir(tmp_path)
|
||||
|
||||
from app.core.config import Settings, get_settings
|
||||
get_settings.cache_clear()
|
||||
settings = Settings(_env_file=())
|
||||
assert settings.video_upload_dir == "./uploads"
|
||||
assert settings.max_video_size_mb == 300
|
||||
assert ".mp4" in settings.supported_video_formats
|
||||
|
||||
|
||||
def test_config_loads_from_env(tmp_path, monkeypatch):
|
||||
env_file = tmp_path / ".env"
|
||||
env_file.write_text(
|
||||
"DASHSCOPE_API_KEY=sk-test-key\n"
|
||||
"ASR_MODEL_NAME=qwen3-asr-flash\n"
|
||||
"ASR_REALTIME_MODEL_NAME=qwen3-asr-flash-realtime\n"
|
||||
"VIDEO_UPLOAD_DIR=./test_uploads\n"
|
||||
"MAX_VIDEO_SIZE_MB=500\n"
|
||||
"LLM_API_KEY=sk-test\n"
|
||||
"DP_API_KEY=sk-test\n"
|
||||
"EMBEDDING_API_KEY=sk-test\n"
|
||||
)
|
||||
monkeypatch.chdir(tmp_path)
|
||||
from app.core.config import Settings, get_settings
|
||||
get_settings.cache_clear()
|
||||
|
||||
settings = Settings()
|
||||
assert settings.dashscope_api_key == "sk-test-key"
|
||||
assert settings.asr_model_name == "qwen3-asr-flash"
|
||||
assert settings.asr_realtime_model_name == "qwen3-asr-flash-realtime"
|
||||
assert settings.video_upload_dir == "./test_uploads"
|
||||
assert settings.max_video_size_mb == 500
|
||||
|
|
@ -1,29 +1,84 @@
|
|||
"""Phase 2 tests: Video upload endpoint.
|
||||
|
||||
Covers:
|
||||
- POST /api/v1/upload-video with size validation (<300MB)
|
||||
- POST /api/v1/video/upload with size validation (<300MB)
|
||||
- Format validation (MP4 and common formats)
|
||||
- Static file serving
|
||||
- Error handling for oversized/invalid files
|
||||
"""
|
||||
import pytest
|
||||
from fastapi import FastAPI
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def video_client(tmp_path, monkeypatch):
|
||||
upload_dir = tmp_path / "test_uploads"
|
||||
upload_dir.mkdir()
|
||||
monkeypatch.setenv("VIDEO_UPLOAD_DIR", str(upload_dir))
|
||||
monkeypatch.setenv("MAX_VIDEO_SIZE_MB", "10")
|
||||
|
||||
from app.routers.video import router
|
||||
from app.core.config import get_settings
|
||||
|
||||
get_settings.cache_clear()
|
||||
app = FastAPI()
|
||||
app.include_router(router, prefix="/api/v1")
|
||||
return TestClient(app), upload_dir
|
||||
|
||||
|
||||
class TestVideoUpload:
|
||||
"""Video upload endpoint tests."""
|
||||
def test_upload_mp4_success(self, video_client):
|
||||
client, upload_dir = video_client
|
||||
content = b"\x00" * 1024 * 100 # 100KB dummy data
|
||||
resp = client.post(
|
||||
"/api/v1/video/upload",
|
||||
files={"file": ("test.mp4", content, "video/mp4")},
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
data = resp.json()
|
||||
assert "video_id" in data
|
||||
assert data["filename"] == "test.mp4"
|
||||
assert data["size_bytes"] == 102400
|
||||
assert data["url"].startswith("/api/v1/video/")
|
||||
assert len(list(upload_dir.glob("*"))) == 1
|
||||
|
||||
def test_upload_mp4_success(self):
|
||||
"""Should accept valid MP4 under 300MB."""
|
||||
pass # TODO: implement
|
||||
def test_upload_size_limit(self, video_client):
|
||||
client, upload_dir = video_client
|
||||
content = b"\x00" * (11 * 1024 * 1024) # 11MB, limit is 10MB
|
||||
resp = client.post(
|
||||
"/api/v1/video/upload",
|
||||
files={"file": ("big.mp4", content, "video/mp4")},
|
||||
)
|
||||
assert resp.status_code == 413
|
||||
assert len(list(upload_dir.glob("*"))) == 0
|
||||
|
||||
def test_upload_size_limit(self):
|
||||
"""Should reject files over 300MB."""
|
||||
pass # TODO: implement
|
||||
def test_upload_invalid_format(self, video_client):
|
||||
client, upload_dir = video_client
|
||||
content = b"hello world"
|
||||
resp = client.post(
|
||||
"/api/v1/video/upload",
|
||||
files={"file": ("doc.txt", content, "text/plain")},
|
||||
)
|
||||
assert resp.status_code == 400
|
||||
assert "Unsupported format" in resp.json()["detail"]
|
||||
|
||||
def test_upload_invalid_format(self):
|
||||
"""Should reject non-video formats."""
|
||||
pass # TODO: implement
|
||||
def test_static_file_serving(self, video_client):
|
||||
client, upload_dir = video_client
|
||||
content = b"\x00" * 512
|
||||
resp = client.post(
|
||||
"/api/v1/video/upload",
|
||||
files={"file": ("serve_test.mp4", content, "video/mp4")},
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
video_id = resp.json()["video_id"]
|
||||
|
||||
def test_static_file_serving(self):
|
||||
"""Should serve uploaded video via static URL."""
|
||||
pass # TODO: implement
|
||||
resp = client.get(f"/api/v1/video/{video_id}")
|
||||
assert resp.status_code == 200
|
||||
assert resp.headers["content-type"] == "video/mp4"
|
||||
assert resp.content == content
|
||||
|
||||
def test_unknown_video_returns_404(self, video_client):
|
||||
client, _ = video_client
|
||||
resp = client.get("/api/v1/video/nonexistent")
|
||||
assert resp.status_code == 404
|
||||
|
|
|
|||
|
|
@ -16,3 +16,6 @@ python-multipart>=0.0.6
|
|||
reportlab>=4.2.5
|
||||
langchain>=1.2.12,<1.3.0
|
||||
langchain-openai>=1.1.11,<1.2.0
|
||||
dashscope>=0.4.0
|
||||
aiofiles>=24.0.0
|
||||
zhconv>=1.4.0
|
||||
|
|
|
|||
Loading…
Reference in New Issue