feat: Phase 2.1 config + infrastructure and 2.2 video upload backend

- Add DashScope ASR and video upload config fields to Settings - Create Pydantic models (video.py, asr.py) - Create VideoService with validation, save, serve, delete - Create ASR client stub with float32_to_s16le utility - Implement POST /api/v1/video/upload with streaming validation - Implement GET /api/v1/video/{video_id} with FileResponse - Create WebSocket ASR endpoint stub - Register new routers in main.py - Update .env.example and requirements.txt - Add reference examples for DashScope integration - 8 tests passing (3 config + 5 video upload)
2026-05-06 13:08:19 +08:00 · 2026-05-06 13:08:19 +08:00 · 9934749d2b
parent 63e4c1a385
commit 9934749d2b
17 changed files with 1378 additions and 15 deletions
--- a/.examples/README.md
+++ b/.examples/README.md
@ -0,0 +1,49 @@
 # Alibaba Cloud DashScope ASR — Reference Examples
 Adapted from `/mnt/c/Users/woody/Documents/projects/voice input/` (Cantonese voice-to-text web app).
 ## Files
 | File | What | Language |
 |------|------|----------|
 | `alibaba_asr_backend.py` | FastAPI WebSocket proxy to DashScope realtime ASR | Python |
 | `alibaba_asr_frontend_vanilla.html` | Browser audio capture + WebSocket (vanilla JS) | HTML/JS |
 | `alibaba_asr_frontend_react.tsx` | React/TS hook + component for audio capture | TypeScript/React |
 ## Architecture
 ```
 Browser (Float32 PCM, 16kHz mono)
  │  WebSocket: send(float32Data.buffer)
  ▼
 FastAPI Backend (/ws/asr/{video_id})
  │  Convert Float32 → S16_LE → base64
  ▼
 Alibaba Cloud DashScope (wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime)
  │  Model: qwen3-asr-flash-realtime
  ▼  Language: yue (Cantonese)
 Transcript JSON → Browser
 ```
 ## Key Details
 - **Audio format**: Float32 PCM, 16kHz, mono (browser) → S16_LE PCM, 16kHz, mono, base64 (DashScope)
 - **Model**: `qwen3-asr-flash-realtime` (WebSocket realtime, unlimited duration)
 - **Endpoint**: `wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime`
 - **SDK**: `pip install dashscope>=0.4.0`
 - **Cantonese**: Language code `yue` (works natively with DashScope)
 - **VAD**: Server-side (Alibaba Cloud handles voice activity detection)
 - **Pricing**: ~$0.00009/second
 - **Features**: Punctuation, ITN, filler word filtering, multi-language auto-detect
 ## Dependencies
 ```
 # Python
 dashscope>=0.4.0
 openai>=1.52.0
 zhconv>=1.4.0       # Simplified → Traditional Chinese (optional)
 # No additional JS deps needed — native Web APIs only:
 #   WebSocket, AudioContext, ScriptProcessorNode, getUserMedia
 ```
--- a/.examples/alibaba_asr_backend.py
+++ b/.examples/alibaba_asr_backend.py
@ -0,0 +1,265 @@
 """
 Reference: Alibaba Cloud DashScope Real-Time ASR Proxy (FastAPI WebSocket).
 Extracted and adapted from:
  /mnt/c/Users/woody/Documents/projects/voice input/backend/app.py
 Architecture:
  Browser (Float32 PCM) → FastAPI WebSocket → DashScope Real-Time WebSocket
  API key NEVER leaves the server. Backend proxies audio to Alibaba Cloud.
 Key imports:
  pip install dashscope>=0.4.0 openai>=1.52.0 zhconv
 """
 import os
 import json
 import struct
 import asyncio
 import base64
 import logging
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from dotenv import load_dotenv
 import zhconv  # Optional: simplified → traditional Chinese conversion
 from dashscope.audio.qwen_omni.omni_realtime import (
    OmniRealtimeConversation,
    OmniRealtimeCallback,
    TranscriptionParams,
    MultiModality,
 )
 load_dotenv()
 logger = logging.getLogger("asr-proxy")
 DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY")
 # ─── Audio Conversion: Float32 PCM → S16_LE ─────────────────────────────────
 def float32_to_s16le(float32_bytes: bytes) -> bytes:
    """Convert browser Float32 PCM to S16_LE required by DashScope.
    Browser: Float32Array (values -1.0 to 1.0) → raw bytes
    DashScope: S16_LE PCM 16kHz mono → base64 encoded
    """
    num_samples = len(float32_bytes) // 4
    floats = struct.unpack(f"<{num_samples}f", float32_bytes)
    int16_samples = [max(-32768, min(32767, int(f * 32767.0))) for f in floats]
    return struct.pack(f"<{num_samples}h", *int16_samples)
 def _to_traditional(text: str) -> str:
    """Convert Simplified Chinese to Traditional (for Cantonese display)."""
    if not text:
        return text
    return zhconv.convert(text, "zh-hant")
 def build_display_text(accumulated: str, current: str) -> str:
    """Assemble multi-utterance display text."""
    parts = [p for p in (accumulated, current) if p and p.strip()]
    return " ".join(parts)
 # ─── DashScope Callback Bridge (Sync → Async) ───────────────────────────────
 class DashScopeCallback(OmniRealtimeCallback):
    """Bridges sync DashScope SDK callbacks to async WebSocket messages.
    The DashScope SDK fires callbacks from a background thread.
    We push events to an asyncio.Queue so the async task can read them.
    """
    def __init__(self, event_queue: asyncio.Queue, loop: asyncio.AbstractEventLoop):
        super().__init__()
        self._queue = event_queue
        self._loop = loop
    def on_open(self):
        logger.info("DashScope realtime connection opened")
    def on_event(self, message):
        """Called from SDK background thread."""
        try:
            event = json.loads(message) if isinstance(message, str) else message
            self._loop.call_soon_threadsafe(self._queue.put_nowait, event)
        except Exception as e:
            logger.error(f"DashScope callback error: {e}")
    def on_close(self, code, msg):
        logger.info(f"DashScope realtime closed: code={code}, msg={msg}")
 # ─── WebSocket Proxy Handler ────────────────────────────────────────────────
 async def ws_proxy_dashscope(
    client_ws: WebSocket,
    loop: asyncio.AbstractEventLoop,
    language: str = "yue",  # "yue" for Cantonese, "zh" for Mandarin, "en" for English
 ):
    """Proxy browser audio to DashScope real-time ASR.
    Flow:
      1. Browser sends Float32 PCM bytes via WebSocket
      2. Backend converts to S16_LE → base64
      3. Append audio to DashScope conversation
      4. DashScope SDK sends events → callback → asyncio.Queue
      5. Read events from queue → format as JSON → send to browser
    Protocol (backend → browser):
      Partial:  {"delta": "", "full_text": "...", "language": "yue", "is_final": false}
      Final:    {"delta": "", "full_text": "...", "language": "yue", "is_final": true}
    """
    event_queue: asyncio.Queue = asyncio.Queue()
    callback = DashScopeCallback(event_queue, loop)
    # Initialize DashScope real-time conversation
    conversation = OmniRealtimeConversation(
        model="qwen3-asr-flash-realtime",
        url="wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime",
        callback=callback,
    )
    # connect() is synchronous — run in executor to avoid blocking
    await loop.run_in_executor(None, conversation.connect)
    logger.info("dashscope-session-connected")
    # Configure session: text output + audio transcription
    transcription_kwargs: dict = {
        "sample_rate": 16000,
        "input_audio_format": "pcm",
    }
    if language != "auto":
        transcription_kwargs["language"] = language
    transcription_params = TranscriptionParams(**transcription_kwargs)
    conversation.update_session(
        output_modalities=[MultiModality.TEXT],
        enable_input_audio_transcription=True,
        transcription_params=transcription_params,
    )
    logger.info("dashscope-session-updated lang=%s", language)
    accumulated_text = ""
    current_lang = language
    async def read_events():
        """Async task: read DashScope events from queue → send to browser."""
        nonlocal accumulated_text, current_lang
        while True:
            event = await event_queue.get()
            event_type = event.get("type", "")
            if event_type == "conversation.item.input_audio_transcription.text":
                # Partial result (in-progress utterance)
                stash = event.get("stash", "")
                display = (
                    build_display_text(accumulated_text, stash)
                    if stash else accumulated_text
                )
                await client_ws.send_json({
                    "delta": "",
                    "full_text": _to_traditional(display),
                    "language": event.get("language", current_lang),
                    "is_final": False,
                })
            elif event_type == "conversation.item.input_audio_transcription.completed":
                # Final utterance completed
                transcript = event.get("transcript", "")
                if transcript and transcript.strip():
                    accumulated_text = build_display_text(accumulated_text, transcript)
                current_lang = event.get("language", current_lang)
                logger.info(
                    "dashscope-utterance lang=%s text_len=%d accumulated_len=%d",
                    current_lang, len(transcript), len(accumulated_text),
                )
                await client_ws.send_json({
                    "delta": "",
                    "full_text": _to_traditional(accumulated_text),
                    "language": current_lang,
                    "is_final": True,
                })
    read_task = asyncio.create_task(read_events())
    try:
        # Main loop: receive Float32 PCM from browser → convert → send to DashScope
        while True:
            float32_bytes = await client_ws.receive_bytes()
            s16_bytes = float32_to_s16le(float32_bytes)
            audio_b64 = base64.b64encode(s16_bytes).decode("ascii")
            conversation.append_audio(audio_b64)
    except WebSocketDisconnect:
        pass
    finally:
        read_task.cancel()
        conversation.close()
        logger.info(
            "dashscope-session-closed text_len=%d",
            len(accumulated_text),
        )
 # ─── FastAPI App Setup ──────────────────────────────────────────────────────
 app = FastAPI()
@app.websocket("/ws/asr/{video_id}")
 async def ws_asr_endpoint(websocket: WebSocket, video_id: str, language: str = "yue"):
    """WebSocket endpoint for real-time ASR.
    Query params:
      language: "yue" (Cantonese), "zh" (Mandarin), "en" (English), "auto"
    """
    await websocket.accept()
    loop = asyncio.get_event_loop()
    logger.info("ws-connect video_id=%s lang=%s", video_id, language)
    await ws_proxy_dashscope(websocket, loop, language)
    logger.info("ws-disconnect video_id=%s", video_id)
 # ─── Non-Streaming Fallback (POST) ──────────────────────────────────────────
 from fastapi import UploadFile, File
 from openai import OpenAI
 # Sync client for non-streaming fallback
 sync_client = OpenAI(
    api_key=DASHSCOPE_API_KEY,
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
 )
@app.post("/api/v1/asr/transcribe")
 async def transcribe_file(file: UploadFile = File(...), language: str = "yue"):
    """Non-streaming fallback: transcribe an uploaded audio file."""
    audio = await file.read()
    # Build data URL: data:;base64,<base64_audio>
    audio_b64 = base64.b64encode(audio).decode()
    data_url = f"data:;base64,{audio_b64}"
    resp = sync_client.chat.completions.create(
        model="qwen3-asr-flash",
        messages=[{
            "role": "user",
            "content": [{
                "type": "input_audio",
                "input_audio": {"data": data_url},
            }],
        }],
        extra_body={
            "asr_options": {
                "language": language if language != "auto" else None,
            }
        },
    )
    result = resp.choices[0].message.content
    return {
        "text": _to_traditional(result),
        "language": language,
    }
--- a/.examples/alibaba_asr_frontend_react.tsx
+++ b/.examples/alibaba_asr_frontend_react.tsx
@ -0,0 +1,244 @@
 /**
 * Reference: React/TypeScript adaptation of VIDEO AUDIO capture for Alibaba Cloud ASR.
 *
 * Two modes:
 *   A. Streaming (real-time): capture <video> element's audio during playback
 *      → WebSocket → backend → DashScope realtime → transcript in QueryInput
 *   B. Full Transcript (batch): POST /api/v1/video/{id}/transcribe
 *      → backend extracts audio via ffmpeg → DashScope non-streaming → full transcript
 *
 * Audio pipeline (streaming mode):
 *   <video> element → AudioContext.createMediaElementSource(video)
 *   → ScriptProcessor(4096, 1, 1) → Float32Array
 *   → WebSocket.send(float32Data.buffer) → Backend → DashScope
 *
 * IMPORTANT:
 *   - processor.connect(audioContext.destination) so audio still plays through speakers
 *   - No getUserMedia() needed (no microphone permission)
 *   - Full Transcript mode uses backend ffmpeg to extract audio server-side
 */
 import { useState, useRef, useCallback, useEffect } from 'react';
 // ─── Types ──────────────────────────────────────────────────────────────────
 interface ASRMessage {
  delta: string;
  full_text: string;
  language: string;
  is_final: boolean;
 }
 type ASRStatus = 'idle' | 'connecting' | 'streaming' | 'disconnected' | 'error';
 interface UseVideoASROptions {
  videoId: string;
  videoElement: HTMLVideoElement | null;
  language?: string;  // "yue" | "zh" | "en" | "auto"
  onFinalTranscript?: (text: string) => void;
 }
 // ─── Hook: useVideoASR (Streaming Mode) ─────────────────────────────────────
 export function useVideoASR({
  videoId,
  videoElement,
  language = 'yue',
  onFinalTranscript,
 }: UseVideoASROptions) {
  const [transcript, setTranscript] = useState('');
  const [partialTranscript, setPartialTranscript] = useState('');
  const [status, setStatus] = useState<ASRStatus>('idle');
  const [isStreaming, setIsStreaming] = useState(false);
  const wsRef = useRef<WebSocket | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
  const processorRef = useRef<ScriptProcessorNode | null>(null);
  const sourceRef = useRef<MediaElementAudioSourceNode | null>(null);
  const isStreamingRef = useRef(false);
  const getWSURL = useCallback(() => {
    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
    const host = window.location.host;
    const langParam = language !== 'auto' ? `?language=${language}` : '';
    return `${protocol}//${host}/ws/asr/${videoId}${langParam}`;
  }, [videoId, language]);
  const startStreaming = useCallback(() => {
    if (!videoElement) {
      console.error('No video element available');
      return;
    }
    try {
      setStatus('connecting');
      const audioContext = new AudioContext({ sampleRate: 16000 });
      audioContextRef.current = audioContext;
      const source = audioContext.createMediaElementSource(videoElement);
      sourceRef.current = source;
      const processor = audioContext.createScriptProcessor(4096, 1, 1);
      processorRef.current = processor;
      const ws = new WebSocket(getWSURL());
      wsRef.current = ws;
      ws.onopen = () => {
        isStreamingRef.current = true;
        setIsStreaming(true);
        setStatus('streaming');
      };
      ws.onmessage = (e) => {
        const msg: ASRMessage = JSON.parse(e.data);
        setTranscript(msg.full_text);
        setPartialTranscript(msg.is_final ? '' : msg.full_text);
        if (msg.is_final && msg.full_text.trim()) {
          onFinalTranscript?.(msg.full_text);
        }
      };
      ws.onerror = () => setStatus('error');
      ws.onclose = () => {
        isStreamingRef.current = false;
        setIsStreaming(false);
        setStatus('disconnected');
      };
      processor.onaudioprocess = (e) => {
        if (!isStreamingRef.current) return;
        if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return;
        const float32Data = e.inputBuffer.getChannelData(0);
        wsRef.current.send(float32Data.buffer);
      };
      source.connect(processor);
      processor.connect(audioContext.destination);
    } catch (err) {
      console.error('Video audio capture failed:', err);
      setStatus('error');
    }
  }, [videoElement, getWSURL, onFinalTranscript]);
  const stopStreaming = useCallback(() => {
    isStreamingRef.current = false;
    setIsStreaming(false);
    processorRef.current?.disconnect();
    processorRef.current = null;
    sourceRef.current?.disconnect();
    sourceRef.current = null;
    wsRef.current?.close();
    wsRef.current = null;
    audioContextRef.current?.close();
    audioContextRef.current = null;
    setStatus('idle');
    setPartialTranscript('');
  }, []);
  useEffect(() => {
    return () => {
      isStreamingRef.current = false;
      processorRef.current?.disconnect();
      sourceRef.current?.disconnect();
      wsRef.current?.close();
      audioContextRef.current?.close();
    };
  }, []);
  useEffect(() => {
    if (!videoElement) return;
    const onPlay = () => startStreaming();
    const onPause = () => stopStreaming();
    const onEnded = () => stopStreaming();
    videoElement.addEventListener('play', onPlay);
    videoElement.addEventListener('pause', onPause);
    videoElement.addEventListener('ended', onEnded);
    return () => {
      videoElement.removeEventListener('play', onPlay);
      videoElement.removeEventListener('pause', onPause);
      videoElement.removeEventListener('ended', onEnded);
    };
  }, [videoElement, startStreaming, stopStreaming]);
  return {
    transcript,
    partialTranscript,
    status,
    isStreaming,
    startStreaming,
    stopStreaming,
  };
 }
 // ─── Hook: useFullTranscript (Batch Mode) ───────────────────────────────────
 interface UseFullTranscriptOptions {
  videoId: string;
 }
 export function useFullTranscript({ videoId }: UseFullTranscriptOptions) {
  const [fullTranscript, setFullTranscript] = useState('');
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const requestFullTranscript = useCallback(async () => {
    setIsLoading(true);
    setError(null);
    try {
      const resp = await fetch(`/api/v1/video/${videoId}/transcribe`, {
        method: 'POST',
      });
      if (!resp.ok) {
        throw new Error(`Server returned ${resp.status}`);
      }
      const data = await resp.json();
      setFullTranscript(data.text);
      return data.text;
    } catch (err) {
      const msg = err instanceof Error ? err.message : 'Transcription failed';
      setError(msg);
      return null;
    } finally {
      setIsLoading(false);
    }
  }, [videoId]);
  return { fullTranscript, isLoading, error, requestFullTranscript };
 }
 // ─── Usage Example (in LTTPage.tsx) ─────────────────────────────────────────
 //
 // const videoRef = useRef<HTMLVideoElement>(null);
 // const [currentVideoId, setCurrentVideoId] = useState<string | null>(null);
 //
 // // Streaming ASR (auto-starts on video play, stops on pause/end)
 // const asr = useVideoASR({
 //   videoId: currentVideoId ?? '',
 //   videoElement: videoRef.current,
 //   language: 'yue',
 //   onFinalTranscript: (text) => {
 //     setQueryText(text);  // into QueryInput
 //   },
 // });
 //
 // // Full Transcript (manual button)
 // const ft = useFullTranscript({ videoId: currentVideoId ?? '' });
 //
 // return (
 //   <>
 //     <video ref={videoRef} src={videoUrl} controls />
 //     <button onClick={ft.requestFullTranscript} disabled={ft.isLoading}>
 //       {ft.isLoading ? 'Transcribing...' : 'Full Transcript'}
 //     </button>
 //     <QueryInput
 //       value={queryText}
 //       onChange={setQueryText}
 //       onSubmit={handleQuerySubmit}
 //       partialText={asr.partialTranscript}
 //     />
 //   </>
 // );
--- a/.examples/alibaba_asr_frontend_vanilla.html
+++ b/.examples/alibaba_asr_frontend_vanilla.html
@ -0,0 +1,159 @@
 <!--
 Reference: Browser Audio Capture + WebSocket Streaming for Alibaba Cloud ASR.
 Extracted from: /mnt/c/Users/woody/Documents/projects/voice input/backend/static/index.html
 Architecture:
  Browser mic → AudioContext (16kHz mono) → ScriptProcessor → Float32Array
  → WebSocket.send(float32Data.buffer) → FastAPI → DashScope
 Key points:
  - Must use HTTPS/WSS (Chrome blocks getUserMedia on HTTP)
  - AudioContext sampleRate MUST be 16000
  - ScriptProcessor buffer size: 4096 (lower = more frequent sends, lower latency)
  - Send raw Float32 bytes (NOT base64, NOT WAV) to our backend WebSocket
  - Backend handles Float32 → S16_LE → base64 conversion
  - Language query param: ?language=yue (Cantonese), zh, en, auto
 -->
 <!DOCTYPE html>
 <html lang="zh-HK">
 <head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>ASR Reference - Audio Capture</title>
 </head>
 <body>
  <h1>Alibaba Cloud ASR - Audio Capture Pattern</h1>
  <!-- Language selector -->
  <select id="langSelect">
    <option value="auto">Auto-Detect</option>
    <option value="en">English</option>
    <option value="zh">Mandarin</option>
    <option value="yue" selected>Cantonese</option>
  </select>
  <!-- Record toggle button -->
  <button id="recordBtn">Start Recording</button>
  <!-- Status indicator -->
  <div id="status">Ready</div>
  <!-- Transcript display -->
  <div id="transcript"></div>
 <script>
 // ─── Configuration ──────────────────────────────────────────────────────────
 const WS_PATH = '/ws/asr/session-1';  // video_id from URL or session
 const WS_BASE = `${location.protocol === 'https:' ? 'wss:' : 'ws:'}//${location.host}${WS_PATH}`;
 // ─── State ──────────────────────────────────────────────────────────────────
 let ws = null;
 let audioContext = null;
 let processor = null;       // ScriptProcessorNode
 let stream = null;          // MediaStream
 let isRecording = false;
 // ─── DOM Refs ───────────────────────────────────────────────────────────────
 const recordBtn = document.getElementById('recordBtn');
 const langSelect = document.getElementById('langSelect');
 const statusEl = document.getElementById('status');
 const transcriptEl = document.getElementById('transcript');
 // ─── WebSocket URL Builder ──────────────────────────────────────────────────
 function getWSURL() {
  const lang = langSelect.value;
  return lang !== 'auto' ? `${WS_BASE}?language=${lang}` : WS_BASE;
 }
 // ─── Status Helper ──────────────────────────────────────────────────────────
 function setStatus(text) {
  statusEl.textContent = text;
 }
 // ─── Audio Capture Setup ────────────────────────────────────────────────────
 async function startRecording() {
  try {
    // Step 1: Get microphone access
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    // Step 2: Create AudioContext at EXACTLY 16kHz (required by DashScope)
    audioContext = new AudioContext({ sampleRate: 16000 });
    // Step 3: Create media stream source
    const source = audioContext.createMediaStreamSource(stream);
    // Step 4: Create ScriptProcessor for raw PCM access
    // Buffer size 4096 → ~256ms chunks at 16kHz (lower = more responsive)
    processor = audioContext.createScriptProcessor(4096, 1, 1);
    // Step 5: Connect WebSocket
    ws = new WebSocket(getWSURL());
    ws.onopen = () => {
      isRecording = true;
      recordBtn.textContent = 'Stop Recording';
      setStatus('Listening...');
    };
    ws.onmessage = (e) => {
      const { full_text, language, is_final } = JSON.parse(e.data);
      // Display transcript — in our React app, this goes into QueryInput
      transcriptEl.textContent = full_text || '';
    };
    ws.onerror = () => setStatus('WebSocket error');
    ws.onclose = () => {
      isRecording = false;
      recordBtn.textContent = 'Start Recording';
      setStatus('Disconnected');
    };
    // Step 6: Audio processing callback — fires every ~256ms
    processor.onaudioprocess = (e) => {
      // Get raw Float32 samples from input channel 0
      const float32Data = e.inputBuffer.getChannelData(0);
      // Send raw Float32 bytes to backend (NOT base64, NOT WAV)
      // The .buffer property gives us an ArrayBuffer of the Float32Array
      if (ws && ws.readyState === WebSocket.OPEN && isRecording) {
        ws.send(float32Data.buffer);
      }
    };
    // Step 7: Connect the audio graph
    source.connect(processor);
    processor.connect(audioContext.destination);  // Required: prevents auto-mute
  } catch (err) {
    setStatus('Microphone access denied');
    console.error('Audio capture error:', err);
  }
 }
 // ─── Stop Recording ─────────────────────────────────────────────────────────
 function stopRecording() {
  isRecording = false;
  // Clean up audio graph
  processor?.disconnect();
  stream?.getTracks().forEach(t => t.stop());
  // Close WebSocket
  ws?.close();
  recordBtn.textContent = 'Start Recording';
  setStatus('Ready');
 }
 // ─── Toggle Recording ───────────────────────────────────────────────────────
 recordBtn.addEventListener('click', () => {
  if (isRecording) {
    stopRecording();
  } else {
    startRecording();
  }
 });
 </script>
 </body>
 </html>
--- a/.plans/phase2_implementation_plan.md
+++ b/.plans/phase2_implementation_plan.md
@ -0,0 +1,321 @@
 # Phase 2: Video Upload + Video Audio ASR → RAG — Implementation Plan
 **Created:** 2026-05-06
 **Updated:** 2026-05-06 (video audio capture via createMediaElementSource; Full Transcript batch mode)
 **Status:** Planning — Not Started
 **Depends on:** Phase 1 (Complete)
 ---
 ## 1. Overview
 Phase 2 adds video upload/playback and ASR transcription of the **video's audio track** (not microphone). When the video plays, browser captures the video audio output and streams it to Alibaba Cloud DashScope for real-time transcription. A "Full Transcript" button sends the complete video audio for batch (non-streaming) transcription via backend ffmpeg extraction.
 ### Two ASR Modes
 **Mode A — Streaming (real-time, auto on play):**
 ```
 <video> → AudioContext.createMediaElementSource(video)
  → ScriptProcessor(4096, 1, 1) → Float32 PCM
  → WebSocket → FastAPI → DashScope realtime API
  → transcript JSON → QueryInput (in real time)
 ```
 Auto-starts when video plays, stops on pause/seek/end. Partial transcript flows into QueryInput.
 **Mode B — Full Transcript (batch, manual button):**
 ```
 User clicks "Full Transcript" under video player
  → POST /api/v1/video/{id}/transcribe
  → Backend: ffmpeg extracts audio from uploaded video
  → DashScope OpenAI-compatible API (non-streaming)
  → Complete transcript of entire video → QueryInput
 ```
 Server-side audio extraction via ffmpeg. No browser involvement.
 ### Changes From Previous Versions
 | Aspect | V2 (mic capture) | V3 (video audio capture) |
 |---|---|---|
 | Audio source | getUserMedia() microphone | createMediaElementSource(video) |
 | Trigger | Manual record button | Auto on video play, stop on pause/end |
 | Permissions | Microphone permission required | None |
 | Batch mode | No | Yes — "Full Transcript" button |
 | Backend ffmpeg | Not needed | For Full Transcript mode |
 ---
 ## 2. User Flow
 1. User uploads video → appears in left panel player
 2. User presses play → browser captures video audio → streams to DashScope → transcript in QueryInput
 3. User pauses/seeks/ends → streaming stops, accumulated transcript stays in QueryInput
 4. User edits transcript in QueryInput and clicks Submit → Phase 1 RAG pipeline
 5. **Full Transcript**: clicks "Full Transcript" button → server extracts audio → batch ASR → complete transcript fills QueryInput
 ---
 ## 3. Sub-Phases
 ### Phase 2.1 — Configuration & Infrastructure Setup (0.5 day)
 Config fields, directory structure, service/router/model skeletons, register routers.
 **Test:** `test_phase2_config.py`
 **Tasks:**
 | # | Task | File |
 |---|------|------|
 | 2.1.1 | Add 6 config fields: `dashscope_api_key`, `asr_model_name`, `asr_realtime_model_name`, `video_upload_dir`, `max_video_size_mb`, `supported_video_formats` | `core/config.py` |
 | 2.1.2 | Update `.env.example` | `.env.example` |
 | 2.1.3 | Add deps: `dashscope>=0.4.0`, `openai>=1.52.0`, `zhconv>=1.4.0`, `python-multipart`, `aiofiles` | `requirements.txt` |
 | 2.1.4 | Create `models/video.py` — `VideoUploadResponse`, `FullTranscriptResponse` | `models/video.py` |
 | 2.1.5 | Create `models/asr.py` — `ASRTranscriptEvent` | `models/asr.py` |
 | 2.1.6 | Create `services/video_service.py`, `services/asr_client.py` stubs | `services/` |
 | 2.1.7 | Create `routers/video.py` stub: `POST /upload`, `GET /{id}`, `POST /{id}/transcribe` | `routers/video.py` |
 | 2.1.8 | Create `routers/ws_asr.py` stub: `WS /ws/asr/{video_id}?language=yue` | `routers/ws_asr.py` |
 | 2.1.9 | Register routers in `main.py` | `main.py` |
 | 2.1.10 | Write and pass `test_phase2_config.py` | `app/test/` |
 ---
 ### Phase 2.2 — Video Upload Backend (0.5 day)
 Streaming upload with size/format validation. Reuses `routers/ingest.py` pattern.
 **Test:** `test_phase2_video_upload.py` (implement 4 existing stubs)
 **Tasks:**
 | # | Task | File |
 |---|------|------|
 | 2.2.1 | Write tests — implement all 4 stubs | `test_phase2_video_upload.py` |
 | 2.2.2 | Implement `VideoService.validate_video()`, `save_video()` (streaming, aiofiles) | `services/video_service.py` |
 | 2.2.3 | Implement `VideoService.get_video_path()`, `delete_video()` | `services/video_service.py` |
 | 2.2.4 | Implement `POST /api/v1/video/upload` route | `routers/video.py` |
 | 2.2.5 | Implement `GET /api/v1/video/{video_id}` route (FileResponse) | `routers/video.py` |
 | 2.2.6 | Run tests → pass → commit | — |
 ---
 ### Phase 2.3 — ASR WebSocket Proxy + Full Transcript Backend (1 day)
 Two backend ASR paths: real-time streaming (WebSocket proxy to DashScope) and batch (ffmpeg extract → DashScope non-streaming API).
 **Reference:** `.examples/alibaba_asr_backend.py`
 **Tests:** `test_phase2_asr_client.py` (3 stubs), `test_phase2_ws_asr.py` (3 stubs), `test_phase2_ws_protocol.py` (new), `test_phase2_full_transcript.py` (new)
 **Acceptance Criteria:**
 - WebSocket `/ws/asr/{video_id}?language=yue` → Float32 PCM → S16_LE base64 → DashScope realtime
 - `transcription.text` events → `{"full_text": "...", "is_final": false}` to browser
 - `transcription.completed` events → `{"full_text": "...", "is_final": true}` to browser
 - Language: `yue` (Cantonese), `zh`, `en`, `auto`
 - Traditional Chinese via `zhconv`
 - `POST /api/v1/video/{video_id}/transcribe` → ffmpeg extract audio → DashScope batch → `{"text": "..."}`
 - `DASHSCOPE_API_KEY` not set → clear error
 - Client disconnect → DashScope session closed cleanly
 **Tasks:**
 | # | Task | File |
 |---|------|------|
 | 2.3.1 | Write tests first | `app/test/` |
 | 2.3.2 | `float32_to_s16le()`, `build_display_text()`, `_to_traditional()` | `services/asr_client.py` |
 | 2.3.3 | `DashScopeCallback` (sync SDK → asyncio.Queue bridge) + `_ws_proxy_dashscope()` | `routers/ws_asr.py` |
 | 2.3.4 | WebSocket endpoint | `routers/ws_asr.py` |
 | 2.3.5 | `VideoService.extract_audio()` — ffmpeg async subprocess: PCM16LE 16kHz mono | `services/video_service.py` |
 | 2.3.6 | `ASRClient.transcribe_full()` — batch: WAV → DashScope OpenAI-compatible API | `services/asr_client.py` |
 | 2.3.7 | `POST /api/v1/video/{video_id}/transcribe` route | `routers/video.py` |
 | 2.3.8 | Enhance `conftest.py` mock_asr_client | `conftest.py` |
 | 2.3.9 | Run tests → pass → commit | — |
 ---
 ### Phase 2.4 — Transcript → QueryInput + Full Transcript Button (0.5 day)
 Wire up real-time transcript from streaming ASR into QueryInput. Full Transcript button wiring.
 **Tests:** `test_phase2_useVideoASR.test.ts`, `test_phase2_useFullTranscript.test.ts`, `test_phase2_QueryInput_integration.test.tsx`
 **Acceptance Criteria:**
 - `useVideoASR` hook: auto-starts on video `play`, stops on `pause`/`ended`
 - `useVideoASR` exposes `transcript`, `partialTranscript`, `isStreaming`, `status`
 - `useFullTranscript` hook: `requestFullTranscript()` → loading → transcript → error
 - QueryInput shows transcript (grey italic = partial, black = final)
 - QueryInput accepts `partialText` prop
 **Tasks:**
 | # | Task | File |
 |---|------|------|
 | 2.4.1 | Write tests first | `src/test/` |
 | 2.4.2 | Create `hooks/useVideoASR.ts` (see `.examples/alibaba_asr_frontend_react.tsx`) | `hooks/useVideoASR.ts` |
 | 2.4.3 | Create `hooks/useFullTranscript.ts` | `hooks/useFullTranscript.ts` |
 | 2.4.4 | Update `types/index.ts` — `ASRMessage`, `ASRStatus`, `FullTranscriptResponse` | `types/index.ts` |
 | 2.4.5 | Update `QueryInput.tsx` — add `partialText` prop | `components/QueryInput.tsx` |
 | 2.4.6 | Run tests → pass → commit | — |
 ---
 ### Phase 2.5 — Frontend: Video Player + Buttons + Layout (1.5 days)
 Replace `VideoPlaceholder` with video upload + player. ASR auto on play. Full Transcript button.
 ```
 ┌─────────────────────┬──────────────────────────┐
 │ VideoUpload /       │ QueryInput               │  ← Upper Panel (30%)
 │ VideoPlayer         │   (transcript flows here  │
 │                     │    from video audio ASR)  │
 │ [Full Transcript]   │   [Submit]                │
 ├─────────────────────┴──────────────────────────┤
 │ ResponsePanel                                   │  ← Lower Panel (70%)
 └────────────────────────────────────────────────┘
 ```
 **Tests:** `test_phase2_VideoUpload.test.tsx`, `test_phase2_VideoPlayer.test.tsx`, `test_phase2_LTTPage_integration.test.tsx`
 **Acceptance Criteria:**
 - Drag-and-drop video upload with progress bar (native HTML5)
 - Video player with native `<video controls>` exposing ref
 - ASR auto on play → transcript in QueryInput; stops on pause/end
 - "Full Transcript" button → loading spinner → fills QueryInput with full transcript
 - Error states: upload fails, ASR fails, Full Transcript fails → clear messages
 **Tasks:**
 | # | Task | File |
 |---|------|------|
 | 2.5.1 | Write all 3 tests first | `src/test/` |
 | 2.5.2 | Create `VideoUpload.tsx` — native drag-drop, axios progress | `components/VideoUpload.tsx` |
 | 2.5.3 | Create `VideoPlayer.tsx` — native `<video controls>`, forwardRef | `components/VideoPlayer.tsx` |
 | 2.5.4 | Update `types/index.ts` | `types/index.ts` |
 | 2.5.5 | Update `lib/api.ts` — `uploadVideo()`, `getVideoUrl()`, `requestFullTranscript()` | `lib/api.ts` |
 | 2.5.6 | Update `lib/queries.tsx` — `useVideoUpload()` | `lib/queries.tsx` |
 | 2.5.7 | Refactor `LTTPage.tsx` — replace VideoPlaceholder, wire hooks + QueryInput | `pages/LTTPage.tsx` |
 | 2.5.8 | Update `QueryInput.tsx` — transcript value + partial text styling | `components/QueryInput.tsx` |
 | 2.5.9 | Run tests → pass → commit | — |
 ---
 ### Phase 2.6 — Integration & Acceptance Testing (1 day)
 **Tests:** `test_integration_phase2.py`, `test_acceptance_phase2_video.py`, `test_acceptance_phase2_asr.py`, `test_acceptance_integration_phase2.py`
 **Tasks:**
 | # | Task |
 |---|------|
 | 2.6.1 | Implement integration test (mocked DashScope, real ChromaDB + file I/O) |
 | 2.6.2 | Implement acceptance: real video upload + Full Transcript |
 | 2.6.3 | Implement acceptance: real DashScope streaming + batch |
 | 2.6.4 | Implement E2E acceptance |
 | 2.6.5 | Full regression run |
 | 2.6.6 | Fix failures, final commit |
 ---
 ### Phase 2.7 — Polish & Deployment (0.5 day)
 | # | Task |
 |---|------|
 | 2.7.1 | Structured logging for DashScope proxy + full transcript events |
 | 2.7.2 | Update `nginx.conf` — `client_max_body_size` 350M |
 | 2.7.3 | Verify production build |
 | 2.7.4 | Update `README.md` |
 | 2.7.5 | Final commit |
 ---
 ## 4. Timeline
 | Sub-Phase | Description | Effort | Depends On |
 |---|---|---|---|
 | 2.1 | Config & Infrastructure | 0.5 day | — |
 | 2.2 | Video Upload Backend | 0.5 day | 2.1 |
 | 2.3 | ASR Proxy + Full Transcript | 1 day | 2.1 |
 | 2.4 | Transcript → QueryInput | 0.5 day | 2.3 |
 | 2.5 | Frontend: Layout + Buttons | 1.5 days | 2.2, 2.3 |
 | 2.6 | Integration & Acceptance | 1 day | 2.4, 2.5 |
 | 2.7 | Polish & Deployment | 0.5 day | 2.6 |
 | **Total** | | **5.5 days** | |
 2.2 (upload) and 2.3 (ASR) run concurrently.
 ---
 ## 5. Dependencies
 **Backend:** `dashscope>=0.4.0`, `openai>=1.52.0`, `zhconv>=1.4.0`, `python-multipart`, `aiofiles`
 **Frontend:** None (native Web APIs: `AudioContext.createMediaElementSource`, `ScriptProcessorNode`, `<video>`, HTML5 drag-and-drop)
 **System:** ffmpeg on server (for Full Transcript audio extraction)
 ---
 ## 6. Config Fields
 ```python
 dashscope_api_key: str = ""
 asr_model_name: str = "qwen3-asr-flash"                  # Batch API
 asr_realtime_model_name: str = "qwen3-asr-flash-realtime" # Streaming
 video_upload_dir: str = "./uploads"
 max_video_size_mb: int = 300
 supported_video_formats: list[str] = [".mp4", ".webm", ".mov", ".avi", ".mkv"]
 ```
 ---
 ## 7. Key Design Decisions
 | Decision | Choice | Why |
 |---|---|---|
 | Audio source | `createMediaElementSource(video)` | Captures video audio during playback. No mic permission. |
 | ASR auto trigger | Video `play` event | Transcript appears as user watches. Natural UX. |
 | ASR stop trigger | Video `pause`/`ended` events | Clean lifecycle. New session on next play/seek. |
 | Full Transcript | Manual button + server ffmpeg | User explicitly requests. Server has the file. |
 | Full Transcript ASR | DashScope OpenAI-compatible API | Standard `/v1/chat/completions` with `input_audio`. WAV format. |
 | ASR streaming | DashScope realtime SDK | `OmniRealtimeConversation` + callback → asyncio.Queue bridge |
 | Transcript display | QueryInput textarea | Editable. Same box for typing or ASR output. |
 | SSL | Required | Chrome may block `createMediaElementSource` without secure context. |
 ---
 ## 8. File Manifest
 ### New Files
 ```
 backend/
  app/routers/video.py
  app/routers/ws_asr.py
  app/services/video_service.py
  app/services/asr_client.py
  app/models/video.py
  app/models/asr.py
  app/test/test_phase2_config.py
  app/test/test_phase2_ws_protocol.py
  app/test/test_phase2_full_transcript.py
  app/test/test_phase2_transcript_to_rag.py
 frontend/src/
  components/VideoUpload.tsx
  components/VideoPlayer.tsx
  hooks/useVideoASR.ts
  hooks/useFullTranscript.ts
  test/test_phase2_VideoUpload.test.tsx
  test/test_phase2_VideoPlayer.test.tsx
  test/test_phase2_useVideoASR.test.ts
  test/test_phase2_useFullTranscript.test.ts
  test/test_phase2_QueryInput_integration.test.tsx
  test/test_phase2_LTTPage_integration.test.tsx
 ```
 ### Modified Files
 ```
 backend/app/core/config.py, main.py, test/conftest.py, .env.example, requirements.txt
 frontend/src/pages/LTTPage.tsx, components/QueryInput.tsx, lib/api.ts, lib/queries.tsx, types/index.ts
 ```
 ---
 ## 9. Reference Code (`.examples/`)
 | File | Content |
 |---|---|
 | `alibaba_asr_backend.py` | DashScope WebSocket proxy + non-streaming fallback (FastAPI) |
 | `alibaba_asr_frontend_vanilla.html` | Browser audio capture (vanilla JS, original) |
 | `alibaba_asr_frontend_react.tsx` | React/TS: `useVideoASR` (streaming) + `useFullTranscript` (batch) hooks |
 | `README.md` | Architecture overview + dependency notes |
--- a/backend/.env.example
+++ b/backend/.env.example
@ -26,3 +26,13 @@ PROMPTS_DB_PATH=./data/prompts.db
 HISTORY_DB_PATH=./data/history.db
 CORS_ORIGINS=["http://localhost:5173","http://localhost:3000"]
 # Alibaba Cloud DashScope ASR (Phase 2)
 # Get your key from: https://modelstudio.console.alibabacloud.com
 DASHSCOPE_API_KEY=sk-your-dashscope-key-here
 ASR_MODEL_NAME=qwen3-asr-flash
 ASR_REALTIME_MODEL_NAME=qwen3-asr-flash-realtime
 # Video upload (Phase 2)
 VIDEO_UPLOAD_DIR=./uploads
 MAX_VIDEO_SIZE_MB=300
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@ -44,6 +44,16 @@ class Settings(BaseSettings):
    relevance_threshold: float = 7.0
    llm_timeout: float = 60.0
    # Alibaba Cloud DashScope ASR (Phase 2)
    dashscope_api_key: str = ""
    asr_model_name: str = "qwen3-asr-flash"
    asr_realtime_model_name: str = "qwen3-asr-flash-realtime"
    # Video upload (Phase 2)
    video_upload_dir: str = "./uploads"
    max_video_size_mb: int = 300
    supported_video_formats: list[str] = [".mp4", ".webm", ".mov", ".avi", ".mkv"]
    # Development helpers
    model_config = {"env_file": ".env", "env_file_encoding": "utf-8"}
--- a/backend/app/main.py
+++ b/backend/app/main.py
@ -7,7 +7,7 @@ from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse
-from app.routers import ingest, query, documents, prompts, history, chunks
+from app.routers import ingest, query, documents, prompts, history, chunks, video, ws_asr
 from app.core.config import get_settings
 from app.core.sqlite_db import (
    get_prompts_db,
@ -56,6 +56,8 @@ app.include_router(documents.router, prefix="/api/v1")
 app.include_router(prompts.router)
 app.include_router(history.router)
 app.include_router(chunks.router)
 app.include_router(video.router, prefix="/api/v1")
 app.include_router(ws_asr.router)
 _prompts_conn = get_prompts_db()
 init_prompts_db(_prompts_conn)
--- a/backend/app/models/asr.py
+++ b/backend/app/models/asr.py
@ -0,0 +1,8 @@
 from pydantic import BaseModel
 class ASRTranscriptEvent(BaseModel):
    delta: str = ""
    full_text: str
    language: str
    is_final: bool
--- a/backend/app/models/video.py
+++ b/backend/app/models/video.py
@ -0,0 +1,21 @@
 from pydantic import BaseModel
 class VideoUploadResponse(BaseModel):
    video_id: str
    filename: str
    size_bytes: int
    url: str
 class VideoInfo(BaseModel):
    video_id: str
    filename: str
    size_bytes: int
    upload_date: str
 class FullTranscriptResponse(BaseModel):
    text: str
    language: str
    duration_seconds: float | None = None
--- a/backend/app/routers/video.py
+++ b/backend/app/routers/video.py
@ -0,0 +1,77 @@
 import logging
 import uuid
 from pathlib import Path
 import aiofiles
 from fastapi import APIRouter, UploadFile, File, HTTPException
 from fastapi.responses import FileResponse
 from app.models.video import VideoUploadResponse
 from app.services.video_service import VideoService
 logger = logging.getLogger(__name__)
 router = APIRouter(tags=["video"])
 def _get_video_service() -> VideoService:
    from app.core.config import get_settings
    s = get_settings()
    return VideoService(
        upload_dir=s.video_upload_dir,
        max_size_mb=s.max_video_size_mb,
        supported_formats=s.supported_video_formats,
    )
@router.post("/video/upload", response_model=VideoUploadResponse)
 async def upload_video(file: UploadFile = File(...)):
    service = _get_video_service()
    filename = file.filename or "unknown"
    ext = Path(filename).suffix.lower()
    total_size = 0
    video_id = uuid.uuid4().hex[:12]
    dest_path = service.upload_dir / f"{video_id}{ext}"
    try:
        async with aiofiles.open(dest_path, "wb") as out:
            while chunk := await file.read(1024 * 1024):
                total_size += len(chunk)
                if total_size > service.max_size_bytes:
                    raise HTTPException(
                        status_code=413,
                        detail=f"File exceeds {service.max_size_bytes // 1024 // 1024}MB limit",
                    )
                await out.write(chunk)
    except HTTPException:
        dest_path.unlink(missing_ok=True)
        raise
    except Exception:
        dest_path.unlink(missing_ok=True)
        raise HTTPException(status_code=500, detail="Upload failed")
    service.validate_video(filename, file.content_type, total_size)
    logger.info("Video uploaded: id=%s filename=%s size=%d", video_id, filename, total_size)
    return VideoUploadResponse(
        video_id=video_id,
        filename=filename,
        size_bytes=total_size,
        url=f"/api/v1/video/{video_id}",
    )
@router.get("/video/{video_id}")
 async def serve_video(video_id: str):
    service = _get_video_service()
    video_path = service.get_video_path(video_id)
    ext = video_path.suffix.lower()
    media_types = {
        ".mp4": "video/mp4",
        ".webm": "video/webm",
        ".mov": "video/quicktime",
        ".avi": "video/x-msvideo",
        ".mkv": "video/x-matroska",
    }
    return FileResponse(str(video_path), media_type=media_types.get(ext, "application/octet-stream"))
--- a/backend/app/routers/ws_asr.py
+++ b/backend/app/routers/ws_asr.py
@ -0,0 +1,16 @@
 import logging
 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 logger = logging.getLogger(__name__)
 router = APIRouter(tags=["asr"])
@router.websocket("/ws/asr/{video_id}")
 async def ws_asr_endpoint(websocket: WebSocket, video_id: str, language: str = "yue"):
    await websocket.accept()
    try:
        while True:
            await websocket.receive_bytes()
    except WebSocketDisconnect:
        pass
--- a/backend/app/services/asr_client.py
+++ b/backend/app/services/asr_client.py
@ -0,0 +1,25 @@
 import struct
 import base64
 import logging
 logger = logging.getLogger(__name__)
 def float32_to_s16le(float32_bytes: bytes) -> bytes:
    num_samples = len(float32_bytes) // 4
    floats = struct.unpack(f"<{num_samples}f", float32_bytes)
    int16_samples = [max(-32768, min(32767, int(f * 32767.0))) for f in floats]
    return struct.pack(f"<{num_samples}h", *int16_samples)
 def build_display_text(accumulated: str, current: str) -> str:
    parts = [p for p in (accumulated, current) if p and p.strip()]
    return " ".join(parts)
 class ASRClient:
    def __init__(self, settings):
        self.settings = settings
    async def transcribe_full(self, audio_bytes: bytes, language: str = "yue") -> str:
        raise NotImplementedError
--- a/backend/app/services/video_service.py
+++ b/backend/app/services/video_service.py
@ -0,0 +1,35 @@
 from pathlib import Path
 from fastapi import HTTPException
 class VideoService:
    def __init__(self, upload_dir: str, max_size_mb: int, supported_formats: list[str]):
        self.upload_dir = Path(upload_dir)
        self.max_size_bytes = max_size_mb * 1024 * 1024
        self.supported_formats = supported_formats
        self.upload_dir.mkdir(parents=True, exist_ok=True)
    def validate_video(self, filename: str | None, content_type: str | None, size_bytes: int) -> None:
        if not filename:
            raise HTTPException(status_code=400, detail="No file selected")
        ext = Path(filename).suffix.lower()
        if ext not in self.supported_formats:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported format: {ext}. Supported: {', '.join(self.supported_formats)}",
            )
        if size_bytes > self.max_size_bytes:
            raise HTTPException(
                status_code=413,
                detail=f"File exceeds {self.max_size_bytes // 1024 // 1024}MB limit",
            )
    def get_video_path(self, video_id: str) -> Path:
        candidates = list(self.upload_dir.glob(f"{video_id}.*"))
        if not candidates:
            raise HTTPException(status_code=404, detail="Video not found")
        return candidates[0]
    def delete_video(self, video_id: str) -> None:
        for p in self.upload_dir.glob(f"{video_id}.*"):
            p.unlink()
--- a/backend/app/test/test_phase2_config.py
+++ b/backend/app/test/test_phase2_config.py
@ -0,0 +1,63 @@
 """Phase 2 config tests: ASR and video settings."""
 import pytest
 def test_config_asr_defaults(monkeypatch, tmp_path):
    monkeypatch.delenv("DASHSCOPE_API_KEY", raising=False)
    monkeypatch.delenv("ASR_MODEL_NAME", raising=False)
    monkeypatch.delenv("ASR_REALTIME_MODEL_NAME", raising=False)
    monkeypatch.setenv("LLM_API_KEY", "sk-test")
    monkeypatch.setenv("DP_API_KEY", "sk-test")
    monkeypatch.setenv("EMBEDDING_API_KEY", "sk-test")
    env_file = tmp_path / ".env"
    env_file.write_text("")
    monkeypatch.chdir(tmp_path)
    from app.core.config import Settings, get_settings
    get_settings.cache_clear()
    settings = Settings(_env_file=())
    assert settings.dashscope_api_key == ""
    assert settings.asr_model_name == "qwen3-asr-flash"
    assert settings.asr_realtime_model_name == "qwen3-asr-flash-realtime"
 def test_config_video_defaults(monkeypatch, tmp_path):
    monkeypatch.delenv("VIDEO_UPLOAD_DIR", raising=False)
    monkeypatch.delenv("MAX_VIDEO_SIZE_MB", raising=False)
    monkeypatch.setenv("LLM_API_KEY", "sk-test")
    monkeypatch.setenv("DP_API_KEY", "sk-test")
    monkeypatch.setenv("EMBEDDING_API_KEY", "sk-test")
    env_file = tmp_path / ".env"
    env_file.write_text("")
    monkeypatch.chdir(tmp_path)
    from app.core.config import Settings, get_settings
    get_settings.cache_clear()
    settings = Settings(_env_file=())
    assert settings.video_upload_dir == "./uploads"
    assert settings.max_video_size_mb == 300
    assert ".mp4" in settings.supported_video_formats
 def test_config_loads_from_env(tmp_path, monkeypatch):
    env_file = tmp_path / ".env"
    env_file.write_text(
        "DASHSCOPE_API_KEY=sk-test-key\n"
        "ASR_MODEL_NAME=qwen3-asr-flash\n"
        "ASR_REALTIME_MODEL_NAME=qwen3-asr-flash-realtime\n"
        "VIDEO_UPLOAD_DIR=./test_uploads\n"
        "MAX_VIDEO_SIZE_MB=500\n"
        "LLM_API_KEY=sk-test\n"
        "DP_API_KEY=sk-test\n"
        "EMBEDDING_API_KEY=sk-test\n"
    )
    monkeypatch.chdir(tmp_path)
    from app.core.config import Settings, get_settings
    get_settings.cache_clear()
    settings = Settings()
    assert settings.dashscope_api_key == "sk-test-key"
    assert settings.asr_model_name == "qwen3-asr-flash"
    assert settings.asr_realtime_model_name == "qwen3-asr-flash-realtime"
    assert settings.video_upload_dir == "./test_uploads"
    assert settings.max_video_size_mb == 500
--- a/backend/app/test/test_phase2_video_upload.py
+++ b/backend/app/test/test_phase2_video_upload.py
@ -1,29 +1,84 @@
 """Phase 2 tests: Video upload endpoint.
 Covers:
- POST /api/v1/upload-video with size validation (<300MB)
+- POST /api/v1/video/upload with size validation (<300MB)
 - Format validation (MP4 and common formats)
 - Static file serving
 - Error handling for oversized/invalid files
 """
 import pytest
 from fastapi import FastAPI
 from fastapi.testclient import TestClient
@pytest.fixture
 def video_client(tmp_path, monkeypatch):
    upload_dir = tmp_path / "test_uploads"
    upload_dir.mkdir()
    monkeypatch.setenv("VIDEO_UPLOAD_DIR", str(upload_dir))
    monkeypatch.setenv("MAX_VIDEO_SIZE_MB", "10")
    from app.routers.video import router
    from app.core.config import get_settings
    get_settings.cache_clear()
    app = FastAPI()
    app.include_router(router, prefix="/api/v1")
    return TestClient(app), upload_dir
 class TestVideoUpload:
-    """Video upload endpoint tests."""
+    def test_upload_mp4_success(self, video_client):
        client, upload_dir = video_client
        content = b"\x00" * 1024 * 100  # 100KB dummy data
        resp = client.post(
            "/api/v1/video/upload",
            files={"file": ("test.mp4", content, "video/mp4")},
        )
        assert resp.status_code == 200
        data = resp.json()
        assert "video_id" in data
        assert data["filename"] == "test.mp4"
        assert data["size_bytes"] == 102400
        assert data["url"].startswith("/api/v1/video/")
        assert len(list(upload_dir.glob("*"))) == 1
-    def test_upload_mp4_success(self):
+    def test_upload_size_limit(self, video_client):
-        """Should accept valid MP4 under 300MB."""
+        client, upload_dir = video_client
-        pass  # TODO: implement
+        content = b"\x00" * (11 * 1024 * 1024)  # 11MB, limit is 10MB
        resp = client.post(
            "/api/v1/video/upload",
            files={"file": ("big.mp4", content, "video/mp4")},
        )
        assert resp.status_code == 413
        assert len(list(upload_dir.glob("*"))) == 0
-    def test_upload_size_limit(self):
+    def test_upload_invalid_format(self, video_client):
-        """Should reject files over 300MB."""
+        client, upload_dir = video_client
-        pass  # TODO: implement
+        content = b"hello world"
        resp = client.post(
            "/api/v1/video/upload",
            files={"file": ("doc.txt", content, "text/plain")},
        )
        assert resp.status_code == 400
        assert "Unsupported format" in resp.json()["detail"]
-    def test_upload_invalid_format(self):
+    def test_static_file_serving(self, video_client):
-        """Should reject non-video formats."""
+        client, upload_dir = video_client
-        pass  # TODO: implement
+        content = b"\x00" * 512
        resp = client.post(
            "/api/v1/video/upload",
            files={"file": ("serve_test.mp4", content, "video/mp4")},
        )
        assert resp.status_code == 200
        video_id = resp.json()["video_id"]
-    def test_static_file_serving(self):
+        resp = client.get(f"/api/v1/video/{video_id}")
-        """Should serve uploaded video via static URL."""
+        assert resp.status_code == 200
-        pass  # TODO: implement
+        assert resp.headers["content-type"] == "video/mp4"
        assert resp.content == content
    def test_unknown_video_returns_404(self, video_client):
        client, _ = video_client
        resp = client.get("/api/v1/video/nonexistent")
        assert resp.status_code == 404
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@ -16,3 +16,6 @@ python-multipart>=0.0.6
 reportlab>=4.2.5
 langchain>=1.2.12,<1.3.0
 langchain-openai>=1.1.11,<1.2.0
 dashscope>=0.4.0
 aiofiles>=24.0.0
 zhconv>=1.4.0