// legco_ai_assistant/.examples/alibaba_asr_frontend_react.tsx

/**
 * Reference: React/TypeScript adaptation of VIDEO AUDIO capture for Alibaba Cloud ASR.
 *
 * Two modes:
 *   A. Streaming (real-time): capture <video> element's audio during playback
 *      → WebSocket → backend → DashScope realtime → transcript in QueryInput
 *   B. Full Transcript (batch): POST /api/v1/video/{id}/transcribe
 *      → backend extracts audio via ffmpeg → DashScope non-streaming → full transcript
 *
 * Audio pipeline (streaming mode):
 *   <video> element → AudioContext.createMediaElementSource(video)
 *   → ScriptProcessor(4096, 1, 1) → Float32Array
 *   → WebSocket.send(float32Data.buffer) → Backend → DashScope
 *
 * IMPORTANT:
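 *   - processor.connect(audioContext.destination) keeps the ScriptProcessor running, but
 *     the processor itself emits silence (its output buffer is never written); for the
 *     video to stay audible the media source must also be routed to
 *     audioContext.destination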
 *   - No getUserMedia() needed (no microphone permission)
 *   - Full Transcript mode uses backend ffmpeg to extract audio server-side
 */

import { useState, useRef, useCallback, useEffect } from 'react';

// ─── Types ──────────────────────────────────────────────────────────────────

/**
 * JSON payload received over the ASR WebSocket and consumed by `useVideoASR`.
 */
interface ASRMessage {
  /** Not read anywhere in this file; presumably the newly recognised text — confirm against the backend. */
  delta: string;
  /** Transcript text; stored as `transcript` and, for non-final messages, as `partialTranscript`. */
  full_text: string;
  /** Not read anywhere in this file; presumably the detected/selected language code — confirm against the backend. */
  language: string;
  /** When true the partial transcript is cleared and `onFinalTranscript` is invoked (if `full_text` is non-blank). */
  is_final: boolean;
}

/**
 * Connection state reported by `useVideoASR`:
 * - `idle`: initial state, and the state after `stopStreaming()`
 * - `connecting`: `startStreaming()` has begun and the WebSocket is not yet open
 * - `streaming`: the WebSocket is open
 * - `disconnected`: the WebSocket closed
 * - `error`: the WebSocket reported an error or audio capture setup threw
 */
type ASRStatus = 'idle' | 'connecting' | 'streaming' | 'disconnected' | 'error';

/** Options accepted by `useVideoASR`. */
interface UseVideoASROptions {
  /** Identifier placed in the WebSocket path `/ws/asr/{videoId}`. */
  videoId: string;
  /** The `<video>` element whose audio is captured; `null` while the element is not available. */
  videoElement: HTMLVideoElement | null;
  /**
   * Sent as the `language` query parameter unless it is `'auto'`, in which case the
   * parameter is omitted. `useVideoASR` defaults it to `'yue'`.
   */
  language?: string;  // "yue" | "zh" | "en" | "auto"
  /** Called with `full_text` for every message flagged `is_final` whose text is not blank. */
  onFinalTranscript?: (text: string) => void;
}

// ─── Hook: useVideoASR (Streaming Mode) ─────────────────────────────────────

/**
 * Streams the audio of a `<video>` element to the backend ASR WebSocket while the
 * video is playing and exposes the transcript the backend sends back.
 *
 * Lifecycle:
 * - `play` on the element → `startStreaming()`; `pause` / `ended` → `stopStreaming()`.
 * - The `AudioContext` and `MediaElementAudioSourceNode` are created lazily on the
 *   first start and then REUSED for as long as the same element is passed in. A media
 *   element can be bound to only one source node for its lifetime, so recreating
 *   them on every play would throw `InvalidStateError` on the second play.
 * - The WebSocket and the `ScriptProcessorNode` are per-session: created on start,
 *   released on stop / socket close.
 * - When `videoElement` changes or the component unmounts the whole graph is closed.
 *
 * Notes:
 * - Once captured, the element's audio only reaches the speakers through this
 *   hook's `AudioContext`, which is requested at 16 kHz; playback is therefore
 *   band-limited while the hook owns the element.
 * - Cross-origin media without CORS approval yields silence from the source node.
 * - Mount at most one instance of this hook per video element.
 *
 * @param options.videoId       Identifier placed (URL-encoded) in `/ws/asr/{videoId}`.
 *                              Streaming does not start while it is empty.
 * @param options.videoElement  Element to capture, or `null` while unavailable.
 * @param options.language      Language hint; `'auto'` omits the query parameter. Defaults to `'yue'`.
 * @param options.onFinalTranscript  Called with the text of each non-blank final message.
 *                              The latest callback is always used; changing it does not
 *                              restart streaming or re-register listeners.
 * @returns `transcript`, `partialTranscript`, `status`, `isStreaming` plus the manual
 *          `startStreaming` / `stopStreaming` controls.
 */
export function useVideoASR({
  videoId,
  videoElement,
  language = 'yue',
  onFinalTranscript,
}: UseVideoASROptions) {
  const [transcript, setTranscript] = useState('');
  const [partialTranscript, setPartialTranscript] = useState('');
  const [status, setStatus] = useState<ASRStatus>('idle');
  const [isStreaming, setIsStreaming] = useState(false);

  const wsRef = useRef<WebSocket | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
  const processorRef = useRef<ScriptProcessorNode | null>(null);
  const sourceRef = useRef<MediaElementAudioSourceNode | null>(null);
  const isStreamingRef = useRef(false);

  // Keep the newest callback in a ref so an inline `onFinalTranscript` does not
  // change `startStreaming`'s identity on every render.
  const onFinalTranscriptRef = useRef(onFinalTranscript);
  useEffect(() => {
    onFinalTranscriptRef.current = onFinalTranscript;
  }, [onFinalTranscript]);

  const getWSURL = useCallback(
    () => buildASRSocketURL(window.location, videoId, language),
    [videoId, language],
  );

  /** Releases the per-session resources (processor + socket); the audio graph stays alive. */
  const closeSession = useCallback(() => {
    isStreamingRef.current = false;

    const processor = processorRef.current;
    if (processor) {
      processorRef.current = null;
      processor.onaudioprocess = null;
      // Remove only the source→processor edge; the source→destination edge must
      // survive so the video remains audible between sessions.
      sourceRef.current?.disconnect(processor);
      processor.disconnect();
    }

    const ws = wsRef.current;
    // Clear the ref BEFORE closing so the socket's own handlers see it as stale.
    wsRef.current = null;
    ws?.close();
  }, []);

  /** Closes the AudioContext and source node. Only for element change / unmount. */
  const releaseAudioGraph = useCallback(() => {
    sourceRef.current?.disconnect();
    sourceRef.current = null;

    const audioContext = audioContextRef.current;
    audioContextRef.current = null;
    if (audioContext && audioContext.state !== 'closed') {
      audioContext
        .close()
        .catch((err: unknown) => console.warn('AudioContext close failed:', err));
    }
  }, []);

  const startStreaming = useCallback(() => {
    if (!videoElement) {
      console.error('No video element available');
      return;
    }
    if (!videoId) {
      console.error('No video id available');
      return;
    }
    // A session is already connecting or streaming; starting again would leak it.
    if (wsRef.current) return;

    try {
      setStatus('connecting');

      let audioContext = audioContextRef.current;
      let source = sourceRef.current;
      if (!audioContext || !source) {
        audioContext = new AudioContext({ sampleRate: 16000 });
        // Stored before createMediaElementSource so the catch block can close it
        // if binding the element fails.
        audioContextRef.current = audioContext;
        source = audioContext.createMediaElementSource(videoElement);
        sourceRef.current = source;
        // The ScriptProcessor below outputs silence, so the source itself must
        // feed the speakers.
        source.connect(audioContext.destination);
      }
      if (audioContext.state === 'suspended') {
        audioContext
          .resume()
          .catch((err: unknown) => console.warn('AudioContext resume failed:', err));
      }

      const ws = new WebSocket(getWSURL());
      wsRef.current = ws;
      let failed = false;

      ws.onopen = () => {
        if (wsRef.current !== ws) return;
        isStreamingRef.current = true;
        setIsStreaming(true);
        setStatus('streaming');
      };

      ws.onmessage = (event) => {
        const msg = parseASRMessage(event.data);
        if (!msg) {
          console.warn('Ignoring malformed ASR message');
          return;
        }
        // After stop, late partials would resurrect a cleared partial transcript;
        // late finals are still delivered so the last utterance is not lost.
        if (wsRef.current !== ws && !msg.is_final) return;

        setTranscript(msg.full_text);
        setPartialTranscript(msg.is_final ? '' : msg.full_text);
        if (msg.is_final && msg.full_text.trim()) {
          onFinalTranscriptRef.current?.(msg.full_text);
        }
      };

      ws.onerror = () => {
        if (wsRef.current !== ws) return;
        failed = true;
        setStatus('error');
      };

      ws.onclose = () => {
        // A socket that was already replaced or stopped must not touch current state.
        if (wsRef.current !== ws) return;
        closeSession();
        setIsStreaming(false);
        // Keep 'error' visible instead of overwriting it with 'disconnected'.
        setStatus(failed ? 'error' : 'disconnected');
      };

      const processor = audioContext.createScriptProcessor(4096, 1, 1);
      processor.onaudioprocess = (e) => {
        if (!isStreamingRef.current || ws.readyState !== WebSocket.OPEN) return;
        const float32Data = e.inputBuffer.getChannelData(0);
        ws.send(float32Data.buffer);
      };
      source.connect(processor);
      // The processor is wired to the destination so that it keeps being pulled.
      processor.connect(audioContext.destination);
      // Assigned only after wiring, so closeSession can rely on the edge existing.
      processorRef.current = processor;
    } catch (err) {
      console.error('Video audio capture failed:', err);
      closeSession();
      // A context without a bound source is a half-built graph: discard it.
      if (!sourceRef.current) releaseAudioGraph();
      setStatus('error');
    }
  }, [videoElement, videoId, getWSURL, closeSession, releaseAudioGraph]);

  const stopStreaming = useCallback(() => {
    closeSession();
    setIsStreaming(false);
    setStatus('idle');
    setPartialTranscript('');
  }, [closeSession]);

  // Own the audio graph per element: reset the reported state for a new element
  // and tear everything down when the element changes or the component unmounts.
  useEffect(() => {
    setIsStreaming(false);
    setStatus('idle');
    setPartialTranscript('');
    return () => {
      closeSession();
      releaseAudioGraph();
    };
  }, [videoElement, closeSession, releaseAudioGraph]);

  // Follow the element's playback state.
  useEffect(() => {
    if (!videoElement) return;
    const onPlay = () => startStreaming();
    const onPause = () => stopStreaming();
    const onEnded = () => stopStreaming();
    videoElement.addEventListener('play', onPlay);
    videoElement.addEventListener('pause', onPause);
    videoElement.addEventListener('ended', onEnded);
    return () => {
      videoElement.removeEventListener('play', onPlay);
      videoElement.removeEventListener('pause', onPause);
      videoElement.removeEventListener('ended', onEnded);
    };
  }, [videoElement, startStreaming, stopStreaming]);

  return {
    transcript,
    partialTranscript,
    status,
    isStreaming,
    startStreaming,
    stopStreaming,
  };
}

/**
 * Builds the ASR WebSocket URL for the page origin described by `location`.
 * `videoId` and `language` are URL-encoded; `language === 'auto'` omits the query.
 */
function buildASRSocketURL(
  location: { protocol: string; host: string },
  videoId: string,
  language: string,
): string {
  const protocol = location.protocol === 'https:' ? 'wss:' : 'ws:';
  const query = language !== 'auto' ? `?language=${encodeURIComponent(language)}` : '';
  return `${protocol}//${location.host}/ws/asr/${encodeURIComponent(videoId)}${query}`;
}

/**
 * Parses a raw WebSocket payload into an ASR message.
 * Returns `null` for non-text frames, invalid JSON, or objects without a string `full_text`.
 */
function parseASRMessage(data: unknown): ASRMessage | null {
  if (typeof data !== 'string') return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(data);
  } catch {
    return null;
  }
  if (typeof parsed !== 'object' || parsed === null) return null;

  const fields = parsed as Record<string, unknown>;
  if (typeof fields.full_text !== 'string') return null;

  return {
    delta: typeof fields.delta === 'string' ? fields.delta : '',
    full_text: fields.full_text,
    language: typeof fields.language === 'string' ? fields.language : '',
    is_final: Boolean(fields.is_final),
  };
}


// ─── Hook: useFullTranscript (Batch Mode) ───────────────────────────────────

/** Options accepted by `useFullTranscript`. */
interface UseFullTranscriptOptions {
  /** Identifier placed in the request path `/api/v1/video/{videoId}/transcribe`. */
  videoId: string;
}

/**
 * Requests a full (batch) transcript for a video via
 * `POST /api/v1/video/{videoId}/transcribe`.
 *
 * @param options.videoId  Identifier placed (URL-encoded) in the request path.
 * @returns
 * - `fullTranscript`: text of the last successful response (`''` initially)
 * - `isLoading`: true while a request is in flight
 * - `error`: message of the last failure, or `null`
 * - `requestFullTranscript()`: starts a request; resolves to the transcript text,
 *   or to `null` on failure (the failure is reported through `error`, never thrown)
 */
export function useFullTranscript({ videoId }: UseFullTranscriptOptions) {
  const [fullTranscript, setFullTranscript] = useState('');
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);

  const requestFullTranscript = useCallback(async (): Promise<string | null> => {
    setIsLoading(true);
    setError(null);
    try {
      // Without an id the path would collapse to `/api/v1/video//transcribe`.
      if (!videoId) {
        throw new Error('No video selected');
      }
      const resp = await fetch(
        `/api/v1/video/${encodeURIComponent(videoId)}/transcribe`,
        { method: 'POST' },
      );
      if (!resp.ok) {
        throw new Error(`Server returned ${resp.status}`);
      }
      // Validate before storing so `fullTranscript` is always a string.
      const text = extractTranscriptText(await resp.json());
      setFullTranscript(text);
      return text;
    } catch (err) {
      const msg = err instanceof Error ? err.message : 'Transcription failed';
      setError(msg);
      return null;
    } finally {
      setIsLoading(false);
    }
  }, [videoId]);

  return { fullTranscript, isLoading, error, requestFullTranscript };
}

/**
 * Extracts the `text` field from a transcription response body.
 * @throws Error when the body is not an object with a string `text` field.
 */
function extractTranscriptText(payload: unknown): string {
  if (typeof payload === 'object' && payload !== null) {
    const text = (payload as { text?: unknown }).text;
    if (typeof text === 'string') return text;
  }
  throw new Error('Malformed transcription response');
}


// ─── Usage Example (in LTTPage.tsx) ─────────────────────────────────────────
//
// // Keep the element in state via a callback ref. A plain useRef is still null on the
// // first render and assigning ref.current does not trigger a re-render, so the hook
// // would never receive the element.
// const [videoEl, setVideoEl] = useState<HTMLVideoElement | null>(null);
// const [currentVideoId, setCurrentVideoId] = useState<string | null>(null);
//
// // Streaming ASR (auto-starts on video play, stops on pause/end)
// const asr = useVideoASR({
//   videoId: currentVideoId ?? '',
//   videoElement: videoEl,
//   language: 'yue',
//   onFinalTranscript: (text) => {
//     setQueryText(text);  // into QueryInput
//   },
// });
//
// // Full Transcript (manual button)
// const ft = useFullTranscript({ videoId: currentVideoId ?? '' });
//
// return (
//   <>
//     <video ref={setVideoEl} src={videoUrl} controls />
//     <button onClick={ft.requestFullTranscript} disabled={ft.isLoading}>
//       {ft.isLoading ? 'Transcribing...' : 'Full Transcript'}
//     </button>
//     <QueryInput
//       value={queryText}
//       onChange={setQueryText}
//       onSubmit={handleQuerySubmit}
//       partialText={asr.partialTranscript}
//     />
//   </>
// );