feat: Phase 2.1 config + infrastructure and 2.2 video upload backend
- Add DashScope ASR and video upload config fields to Settings
- Create Pydantic models (video.py, asr.py)
- Create VideoService with validation, save, serve, delete
- Create ASR client stub with float32_to_s16le utility
- Implement POST /api/v1/video/upload with streaming validation
- Implement GET /api/v1/video/{video_id} with FileResponse
- Create WebSocket ASR endpoint stub
- Register new routers in main.py
- Update .env.example and requirements.txt
- Add reference examples for DashScope integration
- 8 tests passing (3 config + 5 video upload)
This commit is contained in:
parent
63e4c1a385
commit
9934749d2b
|
|
@ -0,0 +1,49 @@
|
||||||
|
# Alibaba Cloud DashScope ASR — Reference Examples
|
||||||
|
|
||||||
|
Adapted from `/mnt/c/Users/woody/Documents/projects/voice input/` (Cantonese voice-to-text web app).
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
| File | What | Language |
|
||||||
|
|------|------|----------|
|
||||||
|
| `alibaba_asr_backend.py` | FastAPI WebSocket proxy to DashScope realtime ASR | Python |
|
||||||
|
| `alibaba_asr_frontend_vanilla.html` | Browser audio capture + WebSocket (vanilla JS) | HTML/JS |
|
||||||
|
| `alibaba_asr_frontend_react.tsx` | React/TS hook + component for audio capture | TypeScript/React |
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
Browser (Float32 PCM, 16kHz mono)
|
||||||
|
│ WebSocket: send(float32Data.buffer)
|
||||||
|
▼
|
||||||
|
FastAPI Backend (/ws/asr/{video_id})
|
||||||
|
│ Convert Float32 → S16_LE → base64
|
||||||
|
▼
|
||||||
|
Alibaba Cloud DashScope (wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime)
|
||||||
|
│ Model: qwen3-asr-flash-realtime
|
||||||
|
▼ Language: yue (Cantonese)
|
||||||
|
Transcript JSON → Browser
|
||||||
|
```
|
||||||
|
|
||||||
|
## Key Details
|
||||||
|
|
||||||
|
- **Audio format**: Float32 PCM, 16kHz, mono (browser) → S16_LE PCM, 16kHz, mono, base64 (DashScope)
|
||||||
|
- **Model**: `qwen3-asr-flash-realtime` (WebSocket realtime, unlimited duration)
|
||||||
|
- **Endpoint**: `wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime`
|
||||||
|
- **SDK**: `pip install dashscope>=0.4.0`
|
||||||
|
- **Cantonese**: Language code `yue` (works natively with DashScope)
|
||||||
|
- **VAD**: Server-side (Alibaba Cloud handles voice activity detection)
|
||||||
|
- **Pricing**: ~$0.00009/second
|
||||||
|
- **Features**: Punctuation, ITN, filler word filtering, multi-language auto-detect
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
```
|
||||||
|
# Python
|
||||||
|
dashscope>=0.4.0
|
||||||
|
openai>=1.52.0
|
||||||
|
zhconv>=1.4.0 # Simplified → Traditional Chinese (optional)
|
||||||
|
|
||||||
|
# No additional JS deps needed — native Web APIs only:
|
||||||
|
# WebSocket, AudioContext, ScriptProcessorNode, getUserMedia
|
||||||
|
```
|
||||||
|
|
@ -0,0 +1,265 @@
|
||||||
|
"""
|
||||||
|
Reference: Alibaba Cloud DashScope Real-Time ASR Proxy (FastAPI WebSocket).
|
||||||
|
|
||||||
|
Extracted and adapted from:
|
||||||
|
/mnt/c/Users/woody/Documents/projects/voice input/backend/app.py
|
||||||
|
|
||||||
|
Architecture:
|
||||||
|
Browser (Float32 PCM) → FastAPI WebSocket → DashScope Real-Time WebSocket
|
||||||
|
API key NEVER leaves the server. Backend proxies audio to Alibaba Cloud.
|
||||||
|
|
||||||
|
Key imports:
|
||||||
|
pip install dashscope>=0.4.0 openai>=1.52.0 zhconv
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import struct
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import zhconv # Optional: simplified → traditional Chinese conversion
|
||||||
|
|
||||||
|
from dashscope.audio.qwen_omni.omni_realtime import (
|
||||||
|
OmniRealtimeConversation,
|
||||||
|
OmniRealtimeCallback,
|
||||||
|
TranscriptionParams,
|
||||||
|
MultiModality,
|
||||||
|
)
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
logger = logging.getLogger("asr-proxy")
|
||||||
|
|
||||||
|
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY")
|
||||||
|
|
||||||
|
# ─── Audio Conversion: Float32 PCM → S16_LE ─────────────────────────────────
|
||||||
|
|
||||||
|
def float32_to_s16le(float32_bytes: bytes) -> bytes:
|
||||||
|
"""Convert browser Float32 PCM to S16_LE required by DashScope.
|
||||||
|
|
||||||
|
Browser: Float32Array (values -1.0 to 1.0) → raw bytes
|
||||||
|
DashScope: S16_LE PCM 16kHz mono → base64 encoded
|
||||||
|
"""
|
||||||
|
num_samples = len(float32_bytes) // 4
|
||||||
|
floats = struct.unpack(f"<{num_samples}f", float32_bytes)
|
||||||
|
int16_samples = [max(-32768, min(32767, int(f * 32767.0))) for f in floats]
|
||||||
|
return struct.pack(f"<{num_samples}h", *int16_samples)
|
||||||
|
|
||||||
|
|
||||||
|
def _to_traditional(text: str) -> str:
|
||||||
|
"""Convert Simplified Chinese to Traditional (for Cantonese display)."""
|
||||||
|
if not text:
|
||||||
|
return text
|
||||||
|
return zhconv.convert(text, "zh-hant")
|
||||||
|
|
||||||
|
|
||||||
|
def build_display_text(accumulated: str, current: str) -> str:
|
||||||
|
"""Assemble multi-utterance display text."""
|
||||||
|
parts = [p for p in (accumulated, current) if p and p.strip()]
|
||||||
|
return " ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── DashScope Callback Bridge (Sync → Async) ───────────────────────────────
|
||||||
|
|
||||||
|
class DashScopeCallback(OmniRealtimeCallback):
|
||||||
|
"""Bridges sync DashScope SDK callbacks to async WebSocket messages.
|
||||||
|
|
||||||
|
The DashScope SDK fires callbacks from a background thread.
|
||||||
|
We push events to an asyncio.Queue so the async task can read them.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, event_queue: asyncio.Queue, loop: asyncio.AbstractEventLoop):
|
||||||
|
super().__init__()
|
||||||
|
self._queue = event_queue
|
||||||
|
self._loop = loop
|
||||||
|
|
||||||
|
def on_open(self):
|
||||||
|
logger.info("DashScope realtime connection opened")
|
||||||
|
|
||||||
|
def on_event(self, message):
|
||||||
|
"""Called from SDK background thread."""
|
||||||
|
try:
|
||||||
|
event = json.loads(message) if isinstance(message, str) else message
|
||||||
|
self._loop.call_soon_threadsafe(self._queue.put_nowait, event)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"DashScope callback error: {e}")
|
||||||
|
|
||||||
|
def on_close(self, code, msg):
|
||||||
|
logger.info(f"DashScope realtime closed: code={code}, msg={msg}")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── WebSocket Proxy Handler ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
async def ws_proxy_dashscope(
|
||||||
|
client_ws: WebSocket,
|
||||||
|
loop: asyncio.AbstractEventLoop,
|
||||||
|
language: str = "yue", # "yue" for Cantonese, "zh" for Mandarin, "en" for English
|
||||||
|
):
|
||||||
|
"""Proxy browser audio to DashScope real-time ASR.
|
||||||
|
|
||||||
|
Flow:
|
||||||
|
1. Browser sends Float32 PCM bytes via WebSocket
|
||||||
|
2. Backend converts to S16_LE → base64
|
||||||
|
3. Append audio to DashScope conversation
|
||||||
|
4. DashScope SDK sends events → callback → asyncio.Queue
|
||||||
|
5. Read events from queue → format as JSON → send to browser
|
||||||
|
|
||||||
|
Protocol (backend → browser):
|
||||||
|
Partial: {"delta": "", "full_text": "...", "language": "yue", "is_final": false}
|
||||||
|
Final: {"delta": "", "full_text": "...", "language": "yue", "is_final": true}
|
||||||
|
"""
|
||||||
|
event_queue: asyncio.Queue = asyncio.Queue()
|
||||||
|
callback = DashScopeCallback(event_queue, loop)
|
||||||
|
|
||||||
|
# Initialize DashScope real-time conversation
|
||||||
|
conversation = OmniRealtimeConversation(
|
||||||
|
model="qwen3-asr-flash-realtime",
|
||||||
|
url="wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime",
|
||||||
|
callback=callback,
|
||||||
|
)
|
||||||
|
|
||||||
|
# connect() is synchronous — run in executor to avoid blocking
|
||||||
|
await loop.run_in_executor(None, conversation.connect)
|
||||||
|
logger.info("dashscope-session-connected")
|
||||||
|
|
||||||
|
# Configure session: text output + audio transcription
|
||||||
|
transcription_kwargs: dict = {
|
||||||
|
"sample_rate": 16000,
|
||||||
|
"input_audio_format": "pcm",
|
||||||
|
}
|
||||||
|
if language != "auto":
|
||||||
|
transcription_kwargs["language"] = language
|
||||||
|
|
||||||
|
transcription_params = TranscriptionParams(**transcription_kwargs)
|
||||||
|
conversation.update_session(
|
||||||
|
output_modalities=[MultiModality.TEXT],
|
||||||
|
enable_input_audio_transcription=True,
|
||||||
|
transcription_params=transcription_params,
|
||||||
|
)
|
||||||
|
logger.info("dashscope-session-updated lang=%s", language)
|
||||||
|
|
||||||
|
accumulated_text = ""
|
||||||
|
current_lang = language
|
||||||
|
|
||||||
|
async def read_events():
|
||||||
|
"""Async task: read DashScope events from queue → send to browser."""
|
||||||
|
nonlocal accumulated_text, current_lang
|
||||||
|
while True:
|
||||||
|
event = await event_queue.get()
|
||||||
|
event_type = event.get("type", "")
|
||||||
|
|
||||||
|
if event_type == "conversation.item.input_audio_transcription.text":
|
||||||
|
# Partial result (in-progress utterance)
|
||||||
|
stash = event.get("stash", "")
|
||||||
|
display = (
|
||||||
|
build_display_text(accumulated_text, stash)
|
||||||
|
if stash else accumulated_text
|
||||||
|
)
|
||||||
|
await client_ws.send_json({
|
||||||
|
"delta": "",
|
||||||
|
"full_text": _to_traditional(display),
|
||||||
|
"language": event.get("language", current_lang),
|
||||||
|
"is_final": False,
|
||||||
|
})
|
||||||
|
|
||||||
|
elif event_type == "conversation.item.input_audio_transcription.completed":
|
||||||
|
# Final utterance completed
|
||||||
|
transcript = event.get("transcript", "")
|
||||||
|
if transcript and transcript.strip():
|
||||||
|
accumulated_text = build_display_text(accumulated_text, transcript)
|
||||||
|
current_lang = event.get("language", current_lang)
|
||||||
|
logger.info(
|
||||||
|
"dashscope-utterance lang=%s text_len=%d accumulated_len=%d",
|
||||||
|
current_lang, len(transcript), len(accumulated_text),
|
||||||
|
)
|
||||||
|
await client_ws.send_json({
|
||||||
|
"delta": "",
|
||||||
|
"full_text": _to_traditional(accumulated_text),
|
||||||
|
"language": current_lang,
|
||||||
|
"is_final": True,
|
||||||
|
})
|
||||||
|
|
||||||
|
read_task = asyncio.create_task(read_events())
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Main loop: receive Float32 PCM from browser → convert → send to DashScope
|
||||||
|
while True:
|
||||||
|
float32_bytes = await client_ws.receive_bytes()
|
||||||
|
s16_bytes = float32_to_s16le(float32_bytes)
|
||||||
|
audio_b64 = base64.b64encode(s16_bytes).decode("ascii")
|
||||||
|
conversation.append_audio(audio_b64)
|
||||||
|
except WebSocketDisconnect:
|
||||||
|
pass
|
||||||
|
finally:
|
||||||
|
read_task.cancel()
|
||||||
|
conversation.close()
|
||||||
|
logger.info(
|
||||||
|
"dashscope-session-closed text_len=%d",
|
||||||
|
len(accumulated_text),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── FastAPI App Setup ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
app = FastAPI()
|
||||||
|
|
||||||
|
@app.websocket("/ws/asr/{video_id}")
|
||||||
|
async def ws_asr_endpoint(websocket: WebSocket, video_id: str, language: str = "yue"):
|
||||||
|
"""WebSocket endpoint for real-time ASR.
|
||||||
|
|
||||||
|
Query params:
|
||||||
|
language: "yue" (Cantonese), "zh" (Mandarin), "en" (English), "auto"
|
||||||
|
"""
|
||||||
|
await websocket.accept()
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
logger.info("ws-connect video_id=%s lang=%s", video_id, language)
|
||||||
|
|
||||||
|
await ws_proxy_dashscope(websocket, loop, language)
|
||||||
|
|
||||||
|
logger.info("ws-disconnect video_id=%s", video_id)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Non-Streaming Fallback (POST) ──────────────────────────────────────────
|
||||||
|
|
||||||
|
from fastapi import UploadFile, File
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
# Sync client for non-streaming fallback
|
||||||
|
sync_client = OpenAI(
|
||||||
|
api_key=DASHSCOPE_API_KEY,
|
||||||
|
base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
|
||||||
|
)
|
||||||
|
|
||||||
|
@app.post("/api/v1/asr/transcribe")
|
||||||
|
async def transcribe_file(file: UploadFile = File(...), language: str = "yue"):
|
||||||
|
"""Non-streaming fallback: transcribe an uploaded audio file."""
|
||||||
|
audio = await file.read()
|
||||||
|
|
||||||
|
# Build data URL: data:;base64,<base64_audio>
|
||||||
|
audio_b64 = base64.b64encode(audio).decode()
|
||||||
|
data_url = f"data:;base64,{audio_b64}"
|
||||||
|
|
||||||
|
resp = sync_client.chat.completions.create(
|
||||||
|
model="qwen3-asr-flash",
|
||||||
|
messages=[{
|
||||||
|
"role": "user",
|
||||||
|
"content": [{
|
||||||
|
"type": "input_audio",
|
||||||
|
"input_audio": {"data": data_url},
|
||||||
|
}],
|
||||||
|
}],
|
||||||
|
extra_body={
|
||||||
|
"asr_options": {
|
||||||
|
"language": language if language != "auto" else None,
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
result = resp.choices[0].message.content
|
||||||
|
return {
|
||||||
|
"text": _to_traditional(result),
|
||||||
|
"language": language,
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,244 @@
|
||||||
|
/**
|
||||||
|
* Reference: React/TypeScript adaptation of VIDEO AUDIO capture for Alibaba Cloud ASR.
|
||||||
|
*
|
||||||
|
* Two modes:
|
||||||
|
* A. Streaming (real-time): capture <video> element's audio during playback
|
||||||
|
* → WebSocket → backend → DashScope realtime → transcript in QueryInput
|
||||||
|
* B. Full Transcript (batch): POST /api/v1/video/{id}/transcribe
|
||||||
|
* → backend extracts audio via ffmpeg → DashScope non-streaming → full transcript
|
||||||
|
*
|
||||||
|
* Audio pipeline (streaming mode):
|
||||||
|
* <video> element → AudioContext.createMediaElementSource(video)
|
||||||
|
* → ScriptProcessor(4096, 1, 1) → Float32Array
|
||||||
|
* → WebSocket.send(float32Data.buffer) → Backend → DashScope
|
||||||
|
*
|
||||||
|
* IMPORTANT:
|
||||||
|
* - processor.connect(audioContext.destination) so audio still plays through speakers
|
||||||
|
* - No getUserMedia() needed (no microphone permission)
|
||||||
|
* - Full Transcript mode uses backend ffmpeg to extract audio server-side
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { useState, useRef, useCallback, useEffect } from 'react';
|
||||||
|
|
||||||
|
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
interface ASRMessage {
|
||||||
|
delta: string;
|
||||||
|
full_text: string;
|
||||||
|
language: string;
|
||||||
|
is_final: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
type ASRStatus = 'idle' | 'connecting' | 'streaming' | 'disconnected' | 'error';
|
||||||
|
|
||||||
|
interface UseVideoASROptions {
|
||||||
|
videoId: string;
|
||||||
|
videoElement: HTMLVideoElement | null;
|
||||||
|
language?: string; // "yue" | "zh" | "en" | "auto"
|
||||||
|
onFinalTranscript?: (text: string) => void;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Hook: useVideoASR (Streaming Mode) ─────────────────────────────────────
|
||||||
|
|
||||||
|
export function useVideoASR({
|
||||||
|
videoId,
|
||||||
|
videoElement,
|
||||||
|
language = 'yue',
|
||||||
|
onFinalTranscript,
|
||||||
|
}: UseVideoASROptions) {
|
||||||
|
const [transcript, setTranscript] = useState('');
|
||||||
|
const [partialTranscript, setPartialTranscript] = useState('');
|
||||||
|
const [status, setStatus] = useState<ASRStatus>('idle');
|
||||||
|
const [isStreaming, setIsStreaming] = useState(false);
|
||||||
|
|
||||||
|
const wsRef = useRef<WebSocket | null>(null);
|
||||||
|
const audioContextRef = useRef<AudioContext | null>(null);
|
||||||
|
const processorRef = useRef<ScriptProcessorNode | null>(null);
|
||||||
|
const sourceRef = useRef<MediaElementAudioSourceNode | null>(null);
|
||||||
|
const isStreamingRef = useRef(false);
|
||||||
|
|
||||||
|
const getWSURL = useCallback(() => {
|
||||||
|
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
|
||||||
|
const host = window.location.host;
|
||||||
|
const langParam = language !== 'auto' ? `?language=${language}` : '';
|
||||||
|
return `${protocol}//${host}/ws/asr/${videoId}${langParam}`;
|
||||||
|
}, [videoId, language]);
|
||||||
|
|
||||||
|
const startStreaming = useCallback(() => {
|
||||||
|
if (!videoElement) {
|
||||||
|
console.error('No video element available');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
setStatus('connecting');
|
||||||
|
|
||||||
|
const audioContext = new AudioContext({ sampleRate: 16000 });
|
||||||
|
audioContextRef.current = audioContext;
|
||||||
|
|
||||||
|
const source = audioContext.createMediaElementSource(videoElement);
|
||||||
|
sourceRef.current = source;
|
||||||
|
|
||||||
|
const processor = audioContext.createScriptProcessor(4096, 1, 1);
|
||||||
|
processorRef.current = processor;
|
||||||
|
|
||||||
|
const ws = new WebSocket(getWSURL());
|
||||||
|
wsRef.current = ws;
|
||||||
|
|
||||||
|
ws.onopen = () => {
|
||||||
|
isStreamingRef.current = true;
|
||||||
|
setIsStreaming(true);
|
||||||
|
setStatus('streaming');
|
||||||
|
};
|
||||||
|
|
||||||
|
ws.onmessage = (e) => {
|
||||||
|
const msg: ASRMessage = JSON.parse(e.data);
|
||||||
|
setTranscript(msg.full_text);
|
||||||
|
setPartialTranscript(msg.is_final ? '' : msg.full_text);
|
||||||
|
if (msg.is_final && msg.full_text.trim()) {
|
||||||
|
onFinalTranscript?.(msg.full_text);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
ws.onerror = () => setStatus('error');
|
||||||
|
ws.onclose = () => {
|
||||||
|
isStreamingRef.current = false;
|
||||||
|
setIsStreaming(false);
|
||||||
|
setStatus('disconnected');
|
||||||
|
};
|
||||||
|
|
||||||
|
processor.onaudioprocess = (e) => {
|
||||||
|
if (!isStreamingRef.current) return;
|
||||||
|
if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
|
||||||
|
const float32Data = e.inputBuffer.getChannelData(0);
|
||||||
|
wsRef.current.send(float32Data.buffer);
|
||||||
|
};
|
||||||
|
|
||||||
|
source.connect(processor);
|
||||||
|
processor.connect(audioContext.destination);
|
||||||
|
|
||||||
|
} catch (err) {
|
||||||
|
console.error('Video audio capture failed:', err);
|
||||||
|
setStatus('error');
|
||||||
|
}
|
||||||
|
}, [videoElement, getWSURL, onFinalTranscript]);
|
||||||
|
|
||||||
|
const stopStreaming = useCallback(() => {
|
||||||
|
isStreamingRef.current = false;
|
||||||
|
setIsStreaming(false);
|
||||||
|
processorRef.current?.disconnect();
|
||||||
|
processorRef.current = null;
|
||||||
|
sourceRef.current?.disconnect();
|
||||||
|
sourceRef.current = null;
|
||||||
|
wsRef.current?.close();
|
||||||
|
wsRef.current = null;
|
||||||
|
audioContextRef.current?.close();
|
||||||
|
audioContextRef.current = null;
|
||||||
|
setStatus('idle');
|
||||||
|
setPartialTranscript('');
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
return () => {
|
||||||
|
isStreamingRef.current = false;
|
||||||
|
processorRef.current?.disconnect();
|
||||||
|
sourceRef.current?.disconnect();
|
||||||
|
wsRef.current?.close();
|
||||||
|
audioContextRef.current?.close();
|
||||||
|
};
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
if (!videoElement) return;
|
||||||
|
const onPlay = () => startStreaming();
|
||||||
|
const onPause = () => stopStreaming();
|
||||||
|
const onEnded = () => stopStreaming();
|
||||||
|
videoElement.addEventListener('play', onPlay);
|
||||||
|
videoElement.addEventListener('pause', onPause);
|
||||||
|
videoElement.addEventListener('ended', onEnded);
|
||||||
|
return () => {
|
||||||
|
videoElement.removeEventListener('play', onPlay);
|
||||||
|
videoElement.removeEventListener('pause', onPause);
|
||||||
|
videoElement.removeEventListener('ended', onEnded);
|
||||||
|
};
|
||||||
|
}, [videoElement, startStreaming, stopStreaming]);
|
||||||
|
|
||||||
|
return {
|
||||||
|
transcript,
|
||||||
|
partialTranscript,
|
||||||
|
status,
|
||||||
|
isStreaming,
|
||||||
|
startStreaming,
|
||||||
|
stopStreaming,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// ─── Hook: useFullTranscript (Batch Mode) ───────────────────────────────────
|
||||||
|
|
||||||
|
interface UseFullTranscriptOptions {
|
||||||
|
videoId: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function useFullTranscript({ videoId }: UseFullTranscriptOptions) {
|
||||||
|
const [fullTranscript, setFullTranscript] = useState('');
|
||||||
|
const [isLoading, setIsLoading] = useState(false);
|
||||||
|
const [error, setError] = useState<string | null>(null);
|
||||||
|
|
||||||
|
const requestFullTranscript = useCallback(async () => {
|
||||||
|
setIsLoading(true);
|
||||||
|
setError(null);
|
||||||
|
try {
|
||||||
|
const resp = await fetch(`/api/v1/video/${videoId}/transcribe`, {
|
||||||
|
method: 'POST',
|
||||||
|
});
|
||||||
|
if (!resp.ok) {
|
||||||
|
throw new Error(`Server returned ${resp.status}`);
|
||||||
|
}
|
||||||
|
const data = await resp.json();
|
||||||
|
setFullTranscript(data.text);
|
||||||
|
return data.text;
|
||||||
|
} catch (err) {
|
||||||
|
const msg = err instanceof Error ? err.message : 'Transcription failed';
|
||||||
|
setError(msg);
|
||||||
|
return null;
|
||||||
|
} finally {
|
||||||
|
setIsLoading(false);
|
||||||
|
}
|
||||||
|
}, [videoId]);
|
||||||
|
|
||||||
|
return { fullTranscript, isLoading, error, requestFullTranscript };
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// ─── Usage Example (in LTTPage.tsx) ─────────────────────────────────────────
|
||||||
|
//
|
||||||
|
// const videoRef = useRef<HTMLVideoElement>(null);
|
||||||
|
// const [currentVideoId, setCurrentVideoId] = useState<string | null>(null);
|
||||||
|
//
|
||||||
|
// // Streaming ASR (auto-starts on video play, stops on pause/end)
|
||||||
|
// const asr = useVideoASR({
|
||||||
|
// videoId: currentVideoId ?? '',
|
||||||
|
// videoElement: videoRef.current,
|
||||||
|
// language: 'yue',
|
||||||
|
// onFinalTranscript: (text) => {
|
||||||
|
// setQueryText(text); // into QueryInput
|
||||||
|
// },
|
||||||
|
// });
|
||||||
|
//
|
||||||
|
// // Full Transcript (manual button)
|
||||||
|
// const ft = useFullTranscript({ videoId: currentVideoId ?? '' });
|
||||||
|
//
|
||||||
|
// return (
|
||||||
|
// <>
|
||||||
|
// <video ref={videoRef} src={videoUrl} controls />
|
||||||
|
// <button onClick={ft.requestFullTranscript} disabled={ft.isLoading}>
|
||||||
|
// {ft.isLoading ? 'Transcribing...' : 'Full Transcript'}
|
||||||
|
// </button>
|
||||||
|
// <QueryInput
|
||||||
|
// value={queryText}
|
||||||
|
// onChange={setQueryText}
|
||||||
|
// onSubmit={handleQuerySubmit}
|
||||||
|
// partialText={asr.partialTranscript}
|
||||||
|
// />
|
||||||
|
// </>
|
||||||
|
// );
|
||||||
|
|
@ -0,0 +1,159 @@
|
||||||
|
<!--
|
||||||
|
Reference: Browser Audio Capture + WebSocket Streaming for Alibaba Cloud ASR.
|
||||||
|
|
||||||
|
Extracted from: /mnt/c/Users/woody/Documents/projects/voice input/backend/static/index.html
|
||||||
|
|
||||||
|
Architecture:
|
||||||
|
Browser mic → AudioContext (16kHz mono) → ScriptProcessor → Float32Array
|
||||||
|
→ WebSocket.send(float32Data.buffer) → FastAPI → DashScope
|
||||||
|
|
||||||
|
Key points:
|
||||||
|
- Must use HTTPS/WSS (Chrome blocks getUserMedia on HTTP)
|
||||||
|
- AudioContext sampleRate MUST be 16000
|
||||||
|
- ScriptProcessor buffer size: 4096 (lower = more frequent sends, lower latency)
|
||||||
|
- Send raw Float32 bytes (NOT base64, NOT WAV) to our backend WebSocket
|
||||||
|
- Backend handles Float32 → S16_LE → base64 conversion
|
||||||
|
- Language query param: ?language=yue (Cantonese), zh, en, auto
|
||||||
|
-->
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="zh-HK">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>ASR Reference - Audio Capture</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Alibaba Cloud ASR - Audio Capture Pattern</h1>
|
||||||
|
|
||||||
|
<!-- Language selector -->
|
||||||
|
<select id="langSelect">
|
||||||
|
<option value="auto">Auto-Detect</option>
|
||||||
|
<option value="en">English</option>
|
||||||
|
<option value="zh">Mandarin</option>
|
||||||
|
<option value="yue" selected>Cantonese</option>
|
||||||
|
</select>
|
||||||
|
|
||||||
|
<!-- Record toggle button -->
|
||||||
|
<button id="recordBtn">Start Recording</button>
|
||||||
|
|
||||||
|
<!-- Status indicator -->
|
||||||
|
<div id="status">Ready</div>
|
||||||
|
|
||||||
|
<!-- Transcript display -->
|
||||||
|
<div id="transcript"></div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// ─── Configuration ──────────────────────────────────────────────────────────
|
||||||
|
const WS_PATH = '/ws/asr/session-1'; // video_id from URL or session
|
||||||
|
const WS_BASE = `${location.protocol === 'https:' ? 'wss:' : 'ws:'}//${location.host}${WS_PATH}`;
|
||||||
|
|
||||||
|
// ─── State ──────────────────────────────────────────────────────────────────
|
||||||
|
let ws = null;
|
||||||
|
let audioContext = null;
|
||||||
|
let processor = null; // ScriptProcessorNode
|
||||||
|
let stream = null; // MediaStream
|
||||||
|
let isRecording = false;
|
||||||
|
|
||||||
|
// ─── DOM Refs ───────────────────────────────────────────────────────────────
|
||||||
|
const recordBtn = document.getElementById('recordBtn');
|
||||||
|
const langSelect = document.getElementById('langSelect');
|
||||||
|
const statusEl = document.getElementById('status');
|
||||||
|
const transcriptEl = document.getElementById('transcript');
|
||||||
|
|
||||||
|
// ─── WebSocket URL Builder ──────────────────────────────────────────────────
|
||||||
|
function getWSURL() {
|
||||||
|
const lang = langSelect.value;
|
||||||
|
return lang !== 'auto' ? `${WS_BASE}?language=${lang}` : WS_BASE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Status Helper ──────────────────────────────────────────────────────────
|
||||||
|
function setStatus(text) {
|
||||||
|
statusEl.textContent = text;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Audio Capture Setup ────────────────────────────────────────────────────
|
||||||
|
async function startRecording() {
|
||||||
|
try {
|
||||||
|
// Step 1: Get microphone access
|
||||||
|
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||||
|
|
||||||
|
// Step 2: Create AudioContext at EXACTLY 16kHz (required by DashScope)
|
||||||
|
audioContext = new AudioContext({ sampleRate: 16000 });
|
||||||
|
|
||||||
|
// Step 3: Create media stream source
|
||||||
|
const source = audioContext.createMediaStreamSource(stream);
|
||||||
|
|
||||||
|
// Step 4: Create ScriptProcessor for raw PCM access
|
||||||
|
// Buffer size 4096 → ~256ms chunks at 16kHz (lower = more responsive)
|
||||||
|
processor = audioContext.createScriptProcessor(4096, 1, 1);
|
||||||
|
|
||||||
|
// Step 5: Connect WebSocket
|
||||||
|
ws = new WebSocket(getWSURL());
|
||||||
|
|
||||||
|
ws.onopen = () => {
|
||||||
|
isRecording = true;
|
||||||
|
recordBtn.textContent = 'Stop Recording';
|
||||||
|
setStatus('Listening...');
|
||||||
|
};
|
||||||
|
|
||||||
|
ws.onmessage = (e) => {
|
||||||
|
const { full_text, language, is_final } = JSON.parse(e.data);
|
||||||
|
// Display transcript — in our React app, this goes into QueryInput
|
||||||
|
transcriptEl.textContent = full_text || '';
|
||||||
|
};
|
||||||
|
|
||||||
|
ws.onerror = () => setStatus('WebSocket error');
|
||||||
|
ws.onclose = () => {
|
||||||
|
isRecording = false;
|
||||||
|
recordBtn.textContent = 'Start Recording';
|
||||||
|
setStatus('Disconnected');
|
||||||
|
};
|
||||||
|
|
||||||
|
// Step 6: Audio processing callback — fires every ~256ms
|
||||||
|
processor.onaudioprocess = (e) => {
|
||||||
|
// Get raw Float32 samples from input channel 0
|
||||||
|
const float32Data = e.inputBuffer.getChannelData(0);
|
||||||
|
|
||||||
|
// Send raw Float32 bytes to backend (NOT base64, NOT WAV)
|
||||||
|
// The .buffer property gives us an ArrayBuffer of the Float32Array
|
||||||
|
if (ws && ws.readyState === WebSocket.OPEN && isRecording) {
|
||||||
|
ws.send(float32Data.buffer);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Step 7: Connect the audio graph
|
||||||
|
source.connect(processor);
|
||||||
|
processor.connect(audioContext.destination); // Required: prevents auto-mute
|
||||||
|
|
||||||
|
} catch (err) {
|
||||||
|
setStatus('Microphone access denied');
|
||||||
|
console.error('Audio capture error:', err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Stop Recording ─────────────────────────────────────────────────────────
|
||||||
|
function stopRecording() {
|
||||||
|
isRecording = false;
|
||||||
|
|
||||||
|
// Clean up audio graph
|
||||||
|
processor?.disconnect();
|
||||||
|
stream?.getTracks().forEach(t => t.stop());
|
||||||
|
|
||||||
|
// Close WebSocket
|
||||||
|
ws?.close();
|
||||||
|
|
||||||
|
recordBtn.textContent = 'Start Recording';
|
||||||
|
setStatus('Ready');
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Toggle Recording ───────────────────────────────────────────────────────
|
||||||
|
recordBtn.addEventListener('click', () => {
|
||||||
|
if (isRecording) {
|
||||||
|
stopRecording();
|
||||||
|
} else {
|
||||||
|
startRecording();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
|
@ -0,0 +1,321 @@
|
||||||
|
# Phase 2: Video Upload + Video Audio ASR → RAG — Implementation Plan
|
||||||
|
|
||||||
|
**Created:** 2026-05-06
|
||||||
|
**Updated:** 2026-05-06 (video audio capture via createMediaElementSource; Full Transcript batch mode)
|
||||||
|
**Status:** Planning — Not Started
|
||||||
|
**Depends on:** Phase 1 (Complete)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Overview
|
||||||
|
|
||||||
|
Phase 2 adds video upload/playback and ASR transcription of the **video's audio track** (not microphone). When the video plays, browser captures the video audio output and streams it to Alibaba Cloud DashScope for real-time transcription. A "Full Transcript" button sends the complete video audio for batch (non-streaming) transcription via backend ffmpeg extraction.
|
||||||
|
|
||||||
|
### Two ASR Modes
|
||||||
|
|
||||||
|
**Mode A — Streaming (real-time, auto on play):**
|
||||||
|
```
|
||||||
|
<video> → AudioContext.createMediaElementSource(video)
|
||||||
|
→ ScriptProcessor(4096, 1, 1) → Float32 PCM
|
||||||
|
→ WebSocket → FastAPI → DashScope realtime API
|
||||||
|
→ transcript JSON → QueryInput (in real time)
|
||||||
|
```
|
||||||
|
Auto-starts when video plays, stops on pause/seek/end. Partial transcript flows into QueryInput.
|
||||||
|
|
||||||
|
**Mode B — Full Transcript (batch, manual button):**
|
||||||
|
```
|
||||||
|
User clicks "Full Transcript" under video player
|
||||||
|
→ POST /api/v1/video/{id}/transcribe
|
||||||
|
→ Backend: ffmpeg extracts audio from uploaded video
|
||||||
|
→ DashScope OpenAI-compatible API (non-streaming)
|
||||||
|
→ Complete transcript of entire video → QueryInput
|
||||||
|
```
|
||||||
|
Server-side audio extraction via ffmpeg. No browser involvement.
|
||||||
|
|
||||||
|
### Changes From Previous Versions
|
||||||
|
|
||||||
|
| Aspect | V2 (mic capture) | V3 (video audio capture) |
|
||||||
|
|---|---|---|
|
||||||
|
| Audio source | getUserMedia() microphone | createMediaElementSource(video) |
|
||||||
|
| Trigger | Manual record button | Auto on video play, stop on pause/end |
|
||||||
|
| Permissions | Microphone permission required | None |
|
||||||
|
| Batch mode | No | Yes — "Full Transcript" button |
|
||||||
|
| Backend ffmpeg | Not needed | For Full Transcript mode |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. User Flow
|
||||||
|
|
||||||
|
1. User uploads video → appears in left panel player
|
||||||
|
2. User presses play → browser captures video audio → streams to DashScope → transcript in QueryInput
|
||||||
|
3. User pauses/seeks/ends → streaming stops, accumulated transcript stays in QueryInput
|
||||||
|
4. User edits transcript in QueryInput and clicks Submit → Phase 1 RAG pipeline
|
||||||
|
5. **Full Transcript**: clicks "Full Transcript" button → server extracts audio → batch ASR → complete transcript fills QueryInput
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Sub-Phases
|
||||||
|
|
||||||
|
### Phase 2.1 — Configuration & Infrastructure Setup (0.5 day)
|
||||||
|
|
||||||
|
Config fields, directory structure, service/router/model skeletons, register routers.
|
||||||
|
|
||||||
|
**Test:** `test_phase2_config.py`
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
| # | Task | File |
|
||||||
|
|---|------|------|
|
||||||
|
| 2.1.1 | Add 6 config fields: `dashscope_api_key`, `asr_model_name`, `asr_realtime_model_name`, `video_upload_dir`, `max_video_size_mb`, `supported_video_formats` | `core/config.py` |
|
||||||
|
| 2.1.2 | Update `.env.example` | `.env.example` |
|
||||||
|
| 2.1.3 | Add deps: `dashscope>=0.4.0`, `openai>=1.52.0`, `zhconv>=1.4.0`, `python-multipart`, `aiofiles` | `requirements.txt` |
|
||||||
|
| 2.1.4 | Create `models/video.py` — `VideoUploadResponse`, `FullTranscriptResponse` | `models/video.py` |
|
||||||
|
| 2.1.5 | Create `models/asr.py` — `ASRTranscriptEvent` | `models/asr.py` |
|
||||||
|
| 2.1.6 | Create `services/video_service.py`, `services/asr_client.py` stubs | `services/` |
|
||||||
|
| 2.1.7 | Create `routers/video.py` stub: `POST /upload`, `GET /{id}`, `POST /{id}/transcribe` | `routers/video.py` |
|
||||||
|
| 2.1.8 | Create `routers/ws_asr.py` stub: `WS /ws/asr/{video_id}?language=yue` | `routers/ws_asr.py` |
|
||||||
|
| 2.1.9 | Register routers in `main.py` | `main.py` |
|
||||||
|
| 2.1.10 | Write and pass `test_phase2_config.py` | `app/test/` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 2.2 — Video Upload Backend (0.5 day)
|
||||||
|
|
||||||
|
Streaming upload with size/format validation. Reuses `routers/ingest.py` pattern.
|
||||||
|
|
||||||
|
**Test:** `test_phase2_video_upload.py` (implement 4 existing stubs)
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
| # | Task | File |
|
||||||
|
|---|------|------|
|
||||||
|
| 2.2.1 | Write tests — implement all 4 stubs | `test_phase2_video_upload.py` |
|
||||||
|
| 2.2.2 | Implement `VideoService.validate_video()`, `save_video()` (streaming, aiofiles) | `services/video_service.py` |
|
||||||
|
| 2.2.3 | Implement `VideoService.get_video_path()`, `delete_video()` | `services/video_service.py` |
|
||||||
|
| 2.2.4 | Implement `POST /api/v1/video/upload` route | `routers/video.py` |
|
||||||
|
| 2.2.5 | Implement `GET /api/v1/video/{video_id}` route (FileResponse) | `routers/video.py` |
|
||||||
|
| 2.2.6 | Run tests → pass → commit | — |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 2.3 — ASR WebSocket Proxy + Full Transcript Backend (1 day)
|
||||||
|
|
||||||
|
Two backend ASR paths: real-time streaming (WebSocket proxy to DashScope) and batch (ffmpeg extract → DashScope non-streaming API).
|
||||||
|
|
||||||
|
**Reference:** `.examples/alibaba_asr_backend.py`
|
||||||
|
|
||||||
|
**Tests:** `test_phase2_asr_client.py` (3 stubs), `test_phase2_ws_asr.py` (3 stubs), `test_phase2_ws_protocol.py` (new), `test_phase2_full_transcript.py` (new)
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- WebSocket `/ws/asr/{video_id}?language=yue` → Float32 PCM → S16_LE base64 → DashScope realtime
|
||||||
|
- `transcription.text` events → `{"full_text": "...", "is_final": false}` to browser
|
||||||
|
- `transcription.completed` events → `{"full_text": "...", "is_final": true}` to browser
|
||||||
|
- Language: `yue` (Cantonese), `zh`, `en`, `auto`
|
||||||
|
- Traditional Chinese via `zhconv`
|
||||||
|
- `POST /api/v1/video/{video_id}/transcribe` → ffmpeg extract audio → DashScope batch → `{"text": "..."}`
|
||||||
|
- `DASHSCOPE_API_KEY` not set → clear error
|
||||||
|
- Client disconnect → DashScope session closed cleanly
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
| # | Task | File |
|
||||||
|
|---|------|------|
|
||||||
|
| 2.3.1 | Write tests first | `app/test/` |
|
||||||
|
| 2.3.2 | `float32_to_s16le()`, `build_display_text()`, `_to_traditional()` | `services/asr_client.py` |
|
||||||
|
| 2.3.3 | `DashScopeCallback` (sync SDK → asyncio.Queue bridge) + `_ws_proxy_dashscope()` | `routers/ws_asr.py` |
|
||||||
|
| 2.3.4 | WebSocket endpoint | `routers/ws_asr.py` |
|
||||||
|
| 2.3.5 | `VideoService.extract_audio()` — ffmpeg async subprocess: PCM16LE 16kHz mono | `services/video_service.py` |
|
||||||
|
| 2.3.6 | `ASRClient.transcribe_full()` — batch: WAV → DashScope OpenAI-compatible API | `services/asr_client.py` |
|
||||||
|
| 2.3.7 | `POST /api/v1/video/{video_id}/transcribe` route | `routers/video.py` |
|
||||||
|
| 2.3.8 | Enhance `conftest.py` mock_asr_client | `conftest.py` |
|
||||||
|
| 2.3.9 | Run tests → pass → commit | — |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 2.4 — Transcript → QueryInput + Full Transcript Button (0.5 day)
|
||||||
|
|
||||||
|
Wire up real-time transcript from streaming ASR into QueryInput. Full Transcript button wiring.
|
||||||
|
|
||||||
|
**Tests:** `test_phase2_useVideoASR.test.ts`, `test_phase2_useFullTranscript.test.ts`, `test_phase2_QueryInput_integration.test.tsx`
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- `useVideoASR` hook: auto-starts on video `play`, stops on `pause`/`ended`
|
||||||
|
- `useVideoASR` exposes `transcript`, `partialTranscript`, `isStreaming`, `status`
|
||||||
|
- `useFullTranscript` hook: `requestFullTranscript()` → loading → transcript → error
|
||||||
|
- QueryInput shows transcript (grey italic = partial, black = final)
|
||||||
|
- QueryInput accepts `partialText` prop
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
| # | Task | File |
|
||||||
|
|---|------|------|
|
||||||
|
| 2.4.1 | Write tests first | `src/test/` |
|
||||||
|
| 2.4.2 | Create `hooks/useVideoASR.ts` (see `.examples/alibaba_asr_frontend_react.tsx`) | `hooks/useVideoASR.ts` |
|
||||||
|
| 2.4.3 | Create `hooks/useFullTranscript.ts` | `hooks/useFullTranscript.ts` |
|
||||||
|
| 2.4.4 | Update `types/index.ts` — `ASRMessage`, `ASRStatus`, `FullTranscriptResponse` | `types/index.ts` |
|
||||||
|
| 2.4.5 | Update `QueryInput.tsx` — add `partialText` prop | `components/QueryInput.tsx` |
|
||||||
|
| 2.4.6 | Run tests → pass → commit | — |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 2.5 — Frontend: Video Player + Buttons + Layout (1.5 days)
|
||||||
|
|
||||||
|
Replace `VideoPlaceholder` with video upload + player. ASR auto on play. Full Transcript button.
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────┬──────────────────────────┐
|
||||||
|
│ VideoUpload / │ QueryInput │ ← Upper Panel (30%)
|
||||||
|
│ VideoPlayer │ (transcript flows here │
|
||||||
|
│ │ from video audio ASR) │
|
||||||
|
│ [Full Transcript] │ [Submit] │
|
||||||
|
├─────────────────────┴──────────────────────────┤
|
||||||
|
│ ResponsePanel │ ← Lower Panel (70%)
|
||||||
|
└────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**Tests:** `test_phase2_VideoUpload.test.tsx`, `test_phase2_VideoPlayer.test.tsx`, `test_phase2_LTTPage_integration.test.tsx`
|
||||||
|
|
||||||
|
**Acceptance Criteria:**
|
||||||
|
- Drag-and-drop video upload with progress bar (native HTML5)
|
||||||
|
- Video player with native `<video controls>` exposing ref
|
||||||
|
- ASR auto on play → transcript in QueryInput; stops on pause/end
|
||||||
|
- "Full Transcript" button → loading spinner → fills QueryInput with full transcript
|
||||||
|
- Error states: upload fails, ASR fails, Full Transcript fails → clear messages
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
| # | Task | File |
|
||||||
|
|---|------|------|
|
||||||
|
| 2.5.1 | Write all 3 tests first | `src/test/` |
|
||||||
|
| 2.5.2 | Create `VideoUpload.tsx` — native drag-drop, axios progress | `components/VideoUpload.tsx` |
|
||||||
|
| 2.5.3 | Create `VideoPlayer.tsx` — native `<video controls>`, forwardRef | `components/VideoPlayer.tsx` |
|
||||||
|
| 2.5.4 | Update `types/index.ts` | `types/index.ts` |
|
||||||
|
| 2.5.5 | Update `lib/api.ts` — `uploadVideo()`, `getVideoUrl()`, `requestFullTranscript()` | `lib/api.ts` |
|
||||||
|
| 2.5.6 | Update `lib/queries.tsx` — `useVideoUpload()` | `lib/queries.tsx` |
|
||||||
|
| 2.5.7 | Refactor `LTTPage.tsx` — replace VideoPlaceholder, wire hooks + QueryInput | `pages/LTTPage.tsx` |
|
||||||
|
| 2.5.8 | Update `QueryInput.tsx` — transcript value + partial text styling | `components/QueryInput.tsx` |
|
||||||
|
| 2.5.9 | Run tests → pass → commit | — |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 2.6 — Integration & Acceptance Testing (1 day)
|
||||||
|
|
||||||
|
**Tests:** `test_integration_phase2.py`, `test_acceptance_phase2_video.py`, `test_acceptance_phase2_asr.py`, `test_acceptance_integration_phase2.py`
|
||||||
|
|
||||||
|
**Tasks:**
|
||||||
|
| # | Task |
|
||||||
|
|---|------|
|
||||||
|
| 2.6.1 | Implement integration test (mocked DashScope, real ChromaDB + file I/O) |
|
||||||
|
| 2.6.2 | Implement acceptance: real video upload + Full Transcript |
|
||||||
|
| 2.6.3 | Implement acceptance: real DashScope streaming + batch |
|
||||||
|
| 2.6.4 | Implement E2E acceptance |
|
||||||
|
| 2.6.5 | Full regression run |
|
||||||
|
| 2.6.6 | Fix failures, final commit |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 2.7 — Polish & Deployment (0.5 day)
|
||||||
|
|
||||||
|
| # | Task |
|
||||||
|
|---|------|
|
||||||
|
| 2.7.1 | Structured logging for DashScope proxy + full transcript events |
|
||||||
|
| 2.7.2 | Update `nginx.conf` — `client_max_body_size` 350M |
|
||||||
|
| 2.7.3 | Verify production build |
|
||||||
|
| 2.7.4 | Update `README.md` |
|
||||||
|
| 2.7.5 | Final commit |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Timeline
|
||||||
|
|
||||||
|
| Sub-Phase | Description | Effort | Depends On |
|
||||||
|
|---|---|---|---|
|
||||||
|
| 2.1 | Config & Infrastructure | 0.5 day | — |
|
||||||
|
| 2.2 | Video Upload Backend | 0.5 day | 2.1 |
|
||||||
|
| 2.3 | ASR Proxy + Full Transcript | 1 day | 2.1 |
|
||||||
|
| 2.4 | Transcript → QueryInput | 0.5 day | 2.3 |
|
||||||
|
| 2.5 | Frontend: Layout + Buttons | 1.5 days | 2.2, 2.3 |
|
||||||
|
| 2.6 | Integration & Acceptance | 1 day | 2.4, 2.5 |
|
||||||
|
| 2.7 | Polish & Deployment | 0.5 day | 2.6 |
|
||||||
|
| **Total** | | **5.5 days** | |
|
||||||
|
|
||||||
|
2.2 (upload) and 2.3 (ASR) run concurrently.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Dependencies
|
||||||
|
|
||||||
|
**Backend:** `dashscope>=0.4.0`, `openai>=1.52.0`, `zhconv>=1.4.0`, `python-multipart`, `aiofiles`
|
||||||
|
**Frontend:** None (native Web APIs: `AudioContext.createMediaElementSource`, `ScriptProcessorNode`, `<video>`, HTML5 drag-and-drop)
|
||||||
|
**System:** ffmpeg on server (for Full Transcript audio extraction)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Config Fields
|
||||||
|
|
||||||
|
```python
|
||||||
|
dashscope_api_key: str = ""
|
||||||
|
asr_model_name: str = "qwen3-asr-flash" # Batch API
|
||||||
|
asr_realtime_model_name: str = "qwen3-asr-flash-realtime" # Streaming
|
||||||
|
video_upload_dir: str = "./uploads"
|
||||||
|
max_video_size_mb: int = 300
|
||||||
|
supported_video_formats: list[str] = [".mp4", ".webm", ".mov", ".avi", ".mkv"]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Key Design Decisions
|
||||||
|
|
||||||
|
| Decision | Choice | Why |
|
||||||
|
|---|---|---|
|
||||||
|
| Audio source | `createMediaElementSource(video)` | Captures video audio during playback. No mic permission. |
|
||||||
|
| ASR auto trigger | Video `play` event | Transcript appears as user watches. Natural UX. |
|
||||||
|
| ASR stop trigger | Video `pause`/`ended` events | Clean lifecycle. New session on next play/seek. |
|
||||||
|
| Full Transcript | Manual button + server ffmpeg | User explicitly requests. Server has the file. |
|
||||||
|
| Full Transcript ASR | DashScope OpenAI-compatible API | Standard `/v1/chat/completions` with `input_audio`. WAV format. |
|
||||||
|
| ASR streaming | DashScope realtime SDK | `OmniRealtimeConversation` + callback → asyncio.Queue bridge |
|
||||||
|
| Transcript display | QueryInput textarea | Editable. Same box for typing or ASR output. |
|
||||||
|
| SSL | Required | Chrome may block `createMediaElementSource` without secure context. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. File Manifest
|
||||||
|
|
||||||
|
### New Files
|
||||||
|
```
|
||||||
|
backend/
|
||||||
|
app/routers/video.py
|
||||||
|
app/routers/ws_asr.py
|
||||||
|
app/services/video_service.py
|
||||||
|
app/services/asr_client.py
|
||||||
|
app/models/video.py
|
||||||
|
app/models/asr.py
|
||||||
|
app/test/test_phase2_config.py
|
||||||
|
app/test/test_phase2_ws_protocol.py
|
||||||
|
app/test/test_phase2_full_transcript.py
|
||||||
|
app/test/test_phase2_transcript_to_rag.py
|
||||||
|
|
||||||
|
frontend/src/
|
||||||
|
components/VideoUpload.tsx
|
||||||
|
components/VideoPlayer.tsx
|
||||||
|
hooks/useVideoASR.ts
|
||||||
|
hooks/useFullTranscript.ts
|
||||||
|
test/test_phase2_VideoUpload.test.tsx
|
||||||
|
test/test_phase2_VideoPlayer.test.tsx
|
||||||
|
test/test_phase2_useVideoASR.test.ts
|
||||||
|
test/test_phase2_useFullTranscript.test.ts
|
||||||
|
test/test_phase2_QueryInput_integration.test.tsx
|
||||||
|
test/test_phase2_LTTPage_integration.test.tsx
|
||||||
|
```
|
||||||
|
|
||||||
|
### Modified Files
|
||||||
|
```
|
||||||
|
backend/app/core/config.py, main.py, test/conftest.py, .env.example, requirements.txt
|
||||||
|
frontend/src/pages/LTTPage.tsx, components/QueryInput.tsx, lib/api.ts, lib/queries.tsx, types/index.ts
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Reference Code (`.examples/`)
|
||||||
|
|
||||||
|
| File | Content |
|
||||||
|
|---|---|
|
||||||
|
| `alibaba_asr_backend.py` | DashScope WebSocket proxy + non-streaming fallback (FastAPI) |
|
||||||
|
| `alibaba_asr_frontend_vanilla.html` | Browser audio capture (vanilla JS, original) |
|
||||||
|
| `alibaba_asr_frontend_react.tsx` | React/TS: `useVideoASR` (streaming) + `useFullTranscript` (batch) hooks |
|
||||||
|
| `README.md` | Architecture overview + dependency notes |
|
||||||
|
|
@ -26,3 +26,13 @@ PROMPTS_DB_PATH=./data/prompts.db
|
||||||
HISTORY_DB_PATH=./data/history.db
|
HISTORY_DB_PATH=./data/history.db
|
||||||
|
|
||||||
CORS_ORIGINS=["http://localhost:5173","http://localhost:3000"]
|
CORS_ORIGINS=["http://localhost:5173","http://localhost:3000"]
|
||||||
|
|
||||||
|
# Alibaba Cloud DashScope ASR (Phase 2)
|
||||||
|
# Get your key from: https://modelstudio.console.alibabacloud.com
|
||||||
|
DASHSCOPE_API_KEY=sk-your-dashscope-key-here
|
||||||
|
ASR_MODEL_NAME=qwen3-asr-flash
|
||||||
|
ASR_REALTIME_MODEL_NAME=qwen3-asr-flash-realtime
|
||||||
|
|
||||||
|
# Video upload (Phase 2)
|
||||||
|
VIDEO_UPLOAD_DIR=./uploads
|
||||||
|
MAX_VIDEO_SIZE_MB=300
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,16 @@ class Settings(BaseSettings):
|
||||||
relevance_threshold: float = 7.0
|
relevance_threshold: float = 7.0
|
||||||
llm_timeout: float = 60.0
|
llm_timeout: float = 60.0
|
||||||
|
|
||||||
|
# Alibaba Cloud DashScope ASR (Phase 2)
|
||||||
|
dashscope_api_key: str = ""
|
||||||
|
asr_model_name: str = "qwen3-asr-flash"
|
||||||
|
asr_realtime_model_name: str = "qwen3-asr-flash-realtime"
|
||||||
|
|
||||||
|
# Video upload (Phase 2)
|
||||||
|
video_upload_dir: str = "./uploads"
|
||||||
|
max_video_size_mb: int = 300
|
||||||
|
supported_video_formats: list[str] = [".mp4", ".webm", ".mov", ".avi", ".mkv"]
|
||||||
|
|
||||||
# Development helpers
|
# Development helpers
|
||||||
model_config = {"env_file": ".env", "env_file_encoding": "utf-8"}
|
model_config = {"env_file": ".env", "env_file_encoding": "utf-8"}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ from fastapi import FastAPI
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from fastapi.responses import FileResponse
|
from fastapi.responses import FileResponse
|
||||||
|
|
||||||
from app.routers import ingest, query, documents, prompts, history, chunks
|
from app.routers import ingest, query, documents, prompts, history, chunks, video, ws_asr
|
||||||
from app.core.config import get_settings
|
from app.core.config import get_settings
|
||||||
from app.core.sqlite_db import (
|
from app.core.sqlite_db import (
|
||||||
get_prompts_db,
|
get_prompts_db,
|
||||||
|
|
@ -56,6 +56,8 @@ app.include_router(documents.router, prefix="/api/v1")
|
||||||
app.include_router(prompts.router)
|
app.include_router(prompts.router)
|
||||||
app.include_router(history.router)
|
app.include_router(history.router)
|
||||||
app.include_router(chunks.router)
|
app.include_router(chunks.router)
|
||||||
|
app.include_router(video.router, prefix="/api/v1")
|
||||||
|
app.include_router(ws_asr.router)
|
||||||
|
|
||||||
_prompts_conn = get_prompts_db()
|
_prompts_conn = get_prompts_db()
|
||||||
init_prompts_db(_prompts_conn)
|
init_prompts_db(_prompts_conn)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,8 @@
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class ASRTranscriptEvent(BaseModel):
|
||||||
|
delta: str = ""
|
||||||
|
full_text: str
|
||||||
|
language: str
|
||||||
|
is_final: bool
|
||||||
|
|
@ -0,0 +1,21 @@
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class VideoUploadResponse(BaseModel):
|
||||||
|
video_id: str
|
||||||
|
filename: str
|
||||||
|
size_bytes: int
|
||||||
|
url: str
|
||||||
|
|
||||||
|
|
||||||
|
class VideoInfo(BaseModel):
|
||||||
|
video_id: str
|
||||||
|
filename: str
|
||||||
|
size_bytes: int
|
||||||
|
upload_date: str
|
||||||
|
|
||||||
|
|
||||||
|
class FullTranscriptResponse(BaseModel):
|
||||||
|
text: str
|
||||||
|
language: str
|
||||||
|
duration_seconds: float | None = None
|
||||||
|
|
@ -0,0 +1,77 @@
|
||||||
|
import logging
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import aiofiles
|
||||||
|
from fastapi import APIRouter, UploadFile, File, HTTPException
|
||||||
|
from fastapi.responses import FileResponse
|
||||||
|
|
||||||
|
from app.models.video import VideoUploadResponse
|
||||||
|
from app.services.video_service import VideoService
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
router = APIRouter(tags=["video"])
|
||||||
|
|
||||||
|
|
||||||
|
def _get_video_service() -> VideoService:
|
||||||
|
from app.core.config import get_settings
|
||||||
|
|
||||||
|
s = get_settings()
|
||||||
|
return VideoService(
|
||||||
|
upload_dir=s.video_upload_dir,
|
||||||
|
max_size_mb=s.max_video_size_mb,
|
||||||
|
supported_formats=s.supported_video_formats,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/video/upload", response_model=VideoUploadResponse)
|
||||||
|
async def upload_video(file: UploadFile = File(...)):
|
||||||
|
service = _get_video_service()
|
||||||
|
filename = file.filename or "unknown"
|
||||||
|
ext = Path(filename).suffix.lower()
|
||||||
|
|
||||||
|
total_size = 0
|
||||||
|
video_id = uuid.uuid4().hex[:12]
|
||||||
|
dest_path = service.upload_dir / f"{video_id}{ext}"
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with aiofiles.open(dest_path, "wb") as out:
|
||||||
|
while chunk := await file.read(1024 * 1024):
|
||||||
|
total_size += len(chunk)
|
||||||
|
if total_size > service.max_size_bytes:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=413,
|
||||||
|
detail=f"File exceeds {service.max_size_bytes // 1024 // 1024}MB limit",
|
||||||
|
)
|
||||||
|
await out.write(chunk)
|
||||||
|
except HTTPException:
|
||||||
|
dest_path.unlink(missing_ok=True)
|
||||||
|
raise
|
||||||
|
except Exception:
|
||||||
|
dest_path.unlink(missing_ok=True)
|
||||||
|
raise HTTPException(status_code=500, detail="Upload failed")
|
||||||
|
|
||||||
|
service.validate_video(filename, file.content_type, total_size)
|
||||||
|
logger.info("Video uploaded: id=%s filename=%s size=%d", video_id, filename, total_size)
|
||||||
|
|
||||||
|
return VideoUploadResponse(
|
||||||
|
video_id=video_id,
|
||||||
|
filename=filename,
|
||||||
|
size_bytes=total_size,
|
||||||
|
url=f"/api/v1/video/{video_id}",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/video/{video_id}")
|
||||||
|
async def serve_video(video_id: str):
|
||||||
|
service = _get_video_service()
|
||||||
|
video_path = service.get_video_path(video_id)
|
||||||
|
ext = video_path.suffix.lower()
|
||||||
|
media_types = {
|
||||||
|
".mp4": "video/mp4",
|
||||||
|
".webm": "video/webm",
|
||||||
|
".mov": "video/quicktime",
|
||||||
|
".avi": "video/x-msvideo",
|
||||||
|
".mkv": "video/x-matroska",
|
||||||
|
}
|
||||||
|
return FileResponse(str(video_path), media_type=media_types.get(ext, "application/octet-stream"))
|
||||||
|
|
@ -0,0 +1,16 @@
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from fastapi import APIRouter, WebSocket, WebSocketDisconnect
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
router = APIRouter(tags=["asr"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.websocket("/ws/asr/{video_id}")
|
||||||
|
async def ws_asr_endpoint(websocket: WebSocket, video_id: str, language: str = "yue"):
|
||||||
|
await websocket.accept()
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
await websocket.receive_bytes()
|
||||||
|
except WebSocketDisconnect:
|
||||||
|
pass
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
import struct
|
||||||
|
import base64
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def float32_to_s16le(float32_bytes: bytes) -> bytes:
|
||||||
|
num_samples = len(float32_bytes) // 4
|
||||||
|
floats = struct.unpack(f"<{num_samples}f", float32_bytes)
|
||||||
|
int16_samples = [max(-32768, min(32767, int(f * 32767.0))) for f in floats]
|
||||||
|
return struct.pack(f"<{num_samples}h", *int16_samples)
|
||||||
|
|
||||||
|
|
||||||
|
def build_display_text(accumulated: str, current: str) -> str:
|
||||||
|
parts = [p for p in (accumulated, current) if p and p.strip()]
|
||||||
|
return " ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class ASRClient:
|
||||||
|
def __init__(self, settings):
|
||||||
|
self.settings = settings
|
||||||
|
|
||||||
|
async def transcribe_full(self, audio_bytes: bytes, language: str = "yue") -> str:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
@ -0,0 +1,35 @@
|
||||||
|
from pathlib import Path
|
||||||
|
from fastapi import HTTPException
|
||||||
|
|
||||||
|
|
||||||
|
class VideoService:
|
||||||
|
def __init__(self, upload_dir: str, max_size_mb: int, supported_formats: list[str]):
|
||||||
|
self.upload_dir = Path(upload_dir)
|
||||||
|
self.max_size_bytes = max_size_mb * 1024 * 1024
|
||||||
|
self.supported_formats = supported_formats
|
||||||
|
self.upload_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
def validate_video(self, filename: str | None, content_type: str | None, size_bytes: int) -> None:
|
||||||
|
if not filename:
|
||||||
|
raise HTTPException(status_code=400, detail="No file selected")
|
||||||
|
ext = Path(filename).suffix.lower()
|
||||||
|
if ext not in self.supported_formats:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Unsupported format: {ext}. Supported: {', '.join(self.supported_formats)}",
|
||||||
|
)
|
||||||
|
if size_bytes > self.max_size_bytes:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=413,
|
||||||
|
detail=f"File exceeds {self.max_size_bytes // 1024 // 1024}MB limit",
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_video_path(self, video_id: str) -> Path:
|
||||||
|
candidates = list(self.upload_dir.glob(f"{video_id}.*"))
|
||||||
|
if not candidates:
|
||||||
|
raise HTTPException(status_code=404, detail="Video not found")
|
||||||
|
return candidates[0]
|
||||||
|
|
||||||
|
def delete_video(self, video_id: str) -> None:
|
||||||
|
for p in self.upload_dir.glob(f"{video_id}.*"):
|
||||||
|
p.unlink()
|
||||||
|
|
@ -0,0 +1,63 @@
|
||||||
|
"""Phase 2 config tests: ASR and video settings."""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def test_config_asr_defaults(monkeypatch, tmp_path):
|
||||||
|
monkeypatch.delenv("DASHSCOPE_API_KEY", raising=False)
|
||||||
|
monkeypatch.delenv("ASR_MODEL_NAME", raising=False)
|
||||||
|
monkeypatch.delenv("ASR_REALTIME_MODEL_NAME", raising=False)
|
||||||
|
monkeypatch.setenv("LLM_API_KEY", "sk-test")
|
||||||
|
monkeypatch.setenv("DP_API_KEY", "sk-test")
|
||||||
|
monkeypatch.setenv("EMBEDDING_API_KEY", "sk-test")
|
||||||
|
env_file = tmp_path / ".env"
|
||||||
|
env_file.write_text("")
|
||||||
|
monkeypatch.chdir(tmp_path)
|
||||||
|
|
||||||
|
from app.core.config import Settings, get_settings
|
||||||
|
get_settings.cache_clear()
|
||||||
|
settings = Settings(_env_file=())
|
||||||
|
assert settings.dashscope_api_key == ""
|
||||||
|
assert settings.asr_model_name == "qwen3-asr-flash"
|
||||||
|
assert settings.asr_realtime_model_name == "qwen3-asr-flash-realtime"
|
||||||
|
|
||||||
|
|
||||||
|
def test_config_video_defaults(monkeypatch, tmp_path):
|
||||||
|
monkeypatch.delenv("VIDEO_UPLOAD_DIR", raising=False)
|
||||||
|
monkeypatch.delenv("MAX_VIDEO_SIZE_MB", raising=False)
|
||||||
|
monkeypatch.setenv("LLM_API_KEY", "sk-test")
|
||||||
|
monkeypatch.setenv("DP_API_KEY", "sk-test")
|
||||||
|
monkeypatch.setenv("EMBEDDING_API_KEY", "sk-test")
|
||||||
|
env_file = tmp_path / ".env"
|
||||||
|
env_file.write_text("")
|
||||||
|
monkeypatch.chdir(tmp_path)
|
||||||
|
|
||||||
|
from app.core.config import Settings, get_settings
|
||||||
|
get_settings.cache_clear()
|
||||||
|
settings = Settings(_env_file=())
|
||||||
|
assert settings.video_upload_dir == "./uploads"
|
||||||
|
assert settings.max_video_size_mb == 300
|
||||||
|
assert ".mp4" in settings.supported_video_formats
|
||||||
|
|
||||||
|
|
||||||
|
def test_config_loads_from_env(tmp_path, monkeypatch):
|
||||||
|
env_file = tmp_path / ".env"
|
||||||
|
env_file.write_text(
|
||||||
|
"DASHSCOPE_API_KEY=sk-test-key\n"
|
||||||
|
"ASR_MODEL_NAME=qwen3-asr-flash\n"
|
||||||
|
"ASR_REALTIME_MODEL_NAME=qwen3-asr-flash-realtime\n"
|
||||||
|
"VIDEO_UPLOAD_DIR=./test_uploads\n"
|
||||||
|
"MAX_VIDEO_SIZE_MB=500\n"
|
||||||
|
"LLM_API_KEY=sk-test\n"
|
||||||
|
"DP_API_KEY=sk-test\n"
|
||||||
|
"EMBEDDING_API_KEY=sk-test\n"
|
||||||
|
)
|
||||||
|
monkeypatch.chdir(tmp_path)
|
||||||
|
from app.core.config import Settings, get_settings
|
||||||
|
get_settings.cache_clear()
|
||||||
|
|
||||||
|
settings = Settings()
|
||||||
|
assert settings.dashscope_api_key == "sk-test-key"
|
||||||
|
assert settings.asr_model_name == "qwen3-asr-flash"
|
||||||
|
assert settings.asr_realtime_model_name == "qwen3-asr-flash-realtime"
|
||||||
|
assert settings.video_upload_dir == "./test_uploads"
|
||||||
|
assert settings.max_video_size_mb == 500
|
||||||
|
|
@ -1,29 +1,84 @@
|
||||||
"""Phase 2 tests: Video upload endpoint.
|
"""Phase 2 tests: Video upload endpoint.
|
||||||
|
|
||||||
Covers:
|
Covers:
|
||||||
- POST /api/v1/upload-video with size validation (<300MB)
|
- POST /api/v1/video/upload with size validation (<300MB)
|
||||||
- Format validation (MP4 and common formats)
|
- Format validation (MP4 and common formats)
|
||||||
- Static file serving
|
- Static file serving
|
||||||
- Error handling for oversized/invalid files
|
- Error handling for oversized/invalid files
|
||||||
"""
|
"""
|
||||||
import pytest
|
import pytest
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def video_client(tmp_path, monkeypatch):
|
||||||
|
upload_dir = tmp_path / "test_uploads"
|
||||||
|
upload_dir.mkdir()
|
||||||
|
monkeypatch.setenv("VIDEO_UPLOAD_DIR", str(upload_dir))
|
||||||
|
monkeypatch.setenv("MAX_VIDEO_SIZE_MB", "10")
|
||||||
|
|
||||||
|
from app.routers.video import router
|
||||||
|
from app.core.config import get_settings
|
||||||
|
|
||||||
|
get_settings.cache_clear()
|
||||||
|
app = FastAPI()
|
||||||
|
app.include_router(router, prefix="/api/v1")
|
||||||
|
return TestClient(app), upload_dir
|
||||||
|
|
||||||
|
|
||||||
class TestVideoUpload:
|
class TestVideoUpload:
|
||||||
"""Video upload endpoint tests."""
|
def test_upload_mp4_success(self, video_client):
|
||||||
|
client, upload_dir = video_client
|
||||||
|
content = b"\x00" * 1024 * 100 # 100KB dummy data
|
||||||
|
resp = client.post(
|
||||||
|
"/api/v1/video/upload",
|
||||||
|
files={"file": ("test.mp4", content, "video/mp4")},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
data = resp.json()
|
||||||
|
assert "video_id" in data
|
||||||
|
assert data["filename"] == "test.mp4"
|
||||||
|
assert data["size_bytes"] == 102400
|
||||||
|
assert data["url"].startswith("/api/v1/video/")
|
||||||
|
assert len(list(upload_dir.glob("*"))) == 1
|
||||||
|
|
||||||
def test_upload_mp4_success(self):
|
def test_upload_size_limit(self, video_client):
|
||||||
"""Should accept valid MP4 under 300MB."""
|
client, upload_dir = video_client
|
||||||
pass # TODO: implement
|
content = b"\x00" * (11 * 1024 * 1024) # 11MB, limit is 10MB
|
||||||
|
resp = client.post(
|
||||||
|
"/api/v1/video/upload",
|
||||||
|
files={"file": ("big.mp4", content, "video/mp4")},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 413
|
||||||
|
assert len(list(upload_dir.glob("*"))) == 0
|
||||||
|
|
||||||
def test_upload_size_limit(self):
|
def test_upload_invalid_format(self, video_client):
|
||||||
"""Should reject files over 300MB."""
|
client, upload_dir = video_client
|
||||||
pass # TODO: implement
|
content = b"hello world"
|
||||||
|
resp = client.post(
|
||||||
|
"/api/v1/video/upload",
|
||||||
|
files={"file": ("doc.txt", content, "text/plain")},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 400
|
||||||
|
assert "Unsupported format" in resp.json()["detail"]
|
||||||
|
|
||||||
def test_upload_invalid_format(self):
|
def test_static_file_serving(self, video_client):
|
||||||
"""Should reject non-video formats."""
|
client, upload_dir = video_client
|
||||||
pass # TODO: implement
|
content = b"\x00" * 512
|
||||||
|
resp = client.post(
|
||||||
|
"/api/v1/video/upload",
|
||||||
|
files={"file": ("serve_test.mp4", content, "video/mp4")},
|
||||||
|
)
|
||||||
|
assert resp.status_code == 200
|
||||||
|
video_id = resp.json()["video_id"]
|
||||||
|
|
||||||
def test_static_file_serving(self):
|
resp = client.get(f"/api/v1/video/{video_id}")
|
||||||
"""Should serve uploaded video via static URL."""
|
assert resp.status_code == 200
|
||||||
pass # TODO: implement
|
assert resp.headers["content-type"] == "video/mp4"
|
||||||
|
assert resp.content == content
|
||||||
|
|
||||||
|
def test_unknown_video_returns_404(self, video_client):
|
||||||
|
client, _ = video_client
|
||||||
|
resp = client.get("/api/v1/video/nonexistent")
|
||||||
|
assert resp.status_code == 404
|
||||||
|
|
|
||||||
|
|
@ -16,3 +16,6 @@ python-multipart>=0.0.6
|
||||||
reportlab>=4.2.5
|
reportlab>=4.2.5
|
||||||
langchain>=1.2.12,<1.3.0
|
langchain>=1.2.12,<1.3.0
|
||||||
langchain-openai>=1.1.11,<1.2.0
|
langchain-openai>=1.1.11,<1.2.0
|
||||||
|
dashscope>=0.4.0
|
||||||
|
aiofiles>=24.0.0
|
||||||
|
zhconv>=1.4.0
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue