legco_ai_assistant/.examples/alibaba_asr_frontend_react.tsx

245 lines
7.9 KiB
TypeScript

/**
* Reference: React/TypeScript adaptation of VIDEO AUDIO capture for Alibaba Cloud ASR.
*
* Two modes:
* A. Streaming (real-time): capture <video> element's audio during playback
* → WebSocket → backend → DashScope realtime → transcript in QueryInput
* B. Full Transcript (batch): POST /api/v1/video/{id}/transcribe
* → backend extracts audio via ffmpeg → DashScope non-streaming → full transcript
*
* Audio pipeline (streaming mode):
* <video> element → AudioContext.createMediaElementSource(video)
* → ScriptProcessor(4096, 1, 1) → Float32Array
* → WebSocket.send(float32Data.buffer) → Backend → DashScope
*
* IMPORTANT:
* - processor.connect(audioContext.destination) so audio still plays through speakers
* - No getUserMedia() needed (no microphone permission)
* - Full Transcript mode uses backend ffmpeg to extract audio server-side
*/
import { useState, useRef, useCallback, useEffect } from 'react';
// ─── Types ──────────────────────────────────────────────────────────────────
/**
 * Shape each WebSocket text message is treated as after `JSON.parse`
 * in the streaming hook.
 */
interface ASRMessage {
  /** Not read in this file; presumably the text added since the previous message — confirm with backend. */
  delta: string;
  /** Text shown as the transcript; also handed to `onFinalTranscript` when `is_final` is set. */
  full_text: string;
  /** Not read in this file; presumably the language reported by the recogniser — confirm with backend. */
  language: string;
  /** When true the partial transcript is cleared and `full_text` is reported as final. */
  is_final: boolean;
}
/** Connection state reported by the streaming hook. */
type ASRStatus =
  | 'idle'
  | 'connecting'
  | 'streaming'
  | 'disconnected'
  | 'error';
/** Options accepted by `useVideoASR`. */
interface UseVideoASROptions {
  /** Identifier placed in the WebSocket path (`/ws/asr/{videoId}`). */
  videoId: string;
  /** Element whose audio is captured; streaming cannot start while this is `null`. */
  videoElement: HTMLVideoElement | null;
  /**
   * Recognition language: "yue" | "zh" | "en" | "auto".
   * The hook defaults it to "yue"; "auto" omits the `language` query parameter.
   */
  language?: string;
  /** Called with `full_text` when a message arrives with `is_final` set and non-blank text. */
  onFinalTranscript?: (text: string) => void;
}
// ─── Hook: useVideoASR (Streaming Mode) ─────────────────────────────────────
/** Web Audio nodes bound to one `<video>` element. */
interface VideoAudioGraph {
  context: AudioContext;
  source: MediaElementAudioSourceNode;
}

/**
 * Per-element cache of the audio graph.
 *
 * `createMediaElementSource` may only be called once for a given media
 * element, and from then on the element's audio is routed through that
 * context. The context and source node are therefore created once and
 * reused across play/pause cycles instead of being rebuilt and closed.
 */
const videoAudioGraphs = new WeakMap<HTMLVideoElement, VideoAudioGraph>();

/** Returns the cached audio graph for `video`, creating it on first use. */
function getVideoAudioGraph(video: HTMLVideoElement): VideoAudioGraph {
  const cached = videoAudioGraphs.get(video);
  if (cached) return cached;

  // 16 kHz matches the rate the original capture used.
  // NOTE(review): assumes the backend expects 16 kHz mono Float32 — confirm.
  const context = new AudioContext({ sampleRate: 16000 });
  try {
    const source = context.createMediaElementSource(video);
    // Route the element straight to the speakers. The ScriptProcessor used
    // for capture never writes its output buffer, so it cannot carry playback.
    source.connect(context.destination);
    const graph: VideoAudioGraph = { context, source };
    videoAudioGraphs.set(video, graph);
    return graph;
  } catch (err) {
    // Do not leak a context that could not be bound to the element.
    void context.close().catch(() => undefined);
    throw err;
  }
}

/**
 * Streams the audio of a `<video>` element to `/ws/asr/{videoId}` while the
 * video plays and exposes the transcript returned over the same socket.
 *
 * Streaming starts on the element's `play` event and stops on `pause` /
 * `ended`; `startStreaming` / `stopStreaming` are also returned for manual
 * control.
 *
 * @param options.videoId           Identifier placed in the WebSocket path.
 * @param options.videoElement      Element to capture; nothing starts while `null`.
 * @param options.language          Sent as `?language=`; omitted when `'auto'`. Defaults to `'yue'`.
 * @param options.onFinalTranscript Called with `full_text` for final, non-blank messages.
 * @returns `transcript`, `partialTranscript`, `status`, `isStreaming`,
 *          `startStreaming` and `stopStreaming`.
 */
export function useVideoASR({
  videoId,
  videoElement,
  language = 'yue',
  onFinalTranscript,
}: UseVideoASROptions) {
  const [transcript, setTranscript] = useState('');
  const [partialTranscript, setPartialTranscript] = useState('');
  const [status, setStatus] = useState<ASRStatus>('idle');
  const [isStreaming, setIsStreaming] = useState(false);

  const wsRef = useRef<WebSocket | null>(null);
  const processorRef = useRef<ScriptProcessorNode | null>(null);
  const sourceRef = useRef<MediaElementAudioSourceNode | null>(null);
  const isStreamingRef = useRef(false);

  // Keep the latest callback in a ref so an inline callback neither rebuilds
  // `startStreaming` on every render nor goes stale inside socket handlers.
  const onFinalTranscriptRef = useRef(onFinalTranscript);
  useEffect(() => {
    onFinalTranscriptRef.current = onFinalTranscript;
  }, [onFinalTranscript]);

  const getWSURL = useCallback(() => {
    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
    const query =
      language !== 'auto' ? `?language=${encodeURIComponent(language)}` : '';
    return `${protocol}//${window.location.host}/ws/asr/${encodeURIComponent(videoId)}${query}`;
  }, [videoId, language]);

  /**
   * Releases the per-session resources (processor node and socket) without
   * touching React state. The shared AudioContext and source node stay alive
   * so the video keeps playing audibly and can be captured again later.
   */
  const releaseCapture = useCallback(() => {
    isStreamingRef.current = false;

    const processor = processorRef.current;
    processorRef.current = null;
    if (processor) {
      processor.onaudioprocess = null;
      try {
        // Remove only the capture branch; source -> destination must remain.
        sourceRef.current?.disconnect(processor);
      } catch {
        // The branch was never connected (start failed part-way).
      }
      processor.disconnect();
    }
    sourceRef.current = null;

    const ws = wsRef.current;
    wsRef.current = null;
    if (ws) {
      // Detach handlers first so a deliberate close cannot fire `onclose`
      // later and overwrite the status chosen by the caller.
      ws.onopen = null;
      ws.onmessage = null;
      ws.onerror = null;
      ws.onclose = null;
      ws.close();
    }
  }, []);

  const startStreaming = useCallback(() => {
    if (!videoElement) {
      console.error('No video element available');
      return;
    }
    // Restart cleanly if a session is already active instead of leaking it.
    releaseCapture();
    try {
      setStatus('connecting');

      const { context, source } = getVideoAudioGraph(videoElement);
      sourceRef.current = source;
      if (context.state === 'suspended') {
        void context.resume().catch((err) => {
          console.error('Failed to resume AudioContext:', err);
        });
      }

      const processor = context.createScriptProcessor(4096, 1, 1);
      processorRef.current = processor;

      const ws = new WebSocket(getWSURL());
      wsRef.current = ws;

      ws.onopen = () => {
        isStreamingRef.current = true;
        setIsStreaming(true);
        setStatus('streaming');
      };

      ws.onmessage = (e) => {
        if (typeof e.data !== 'string') return;
        let msg: ASRMessage;
        try {
          msg = JSON.parse(e.data) as ASRMessage;
        } catch (err) {
          console.error('Ignoring malformed ASR message:', err);
          return;
        }
        if (!msg || typeof msg.full_text !== 'string') return;

        setTranscript(msg.full_text);
        setPartialTranscript(msg.is_final ? '' : msg.full_text);
        if (msg.is_final && msg.full_text.trim()) {
          onFinalTranscriptRef.current?.(msg.full_text);
        }
      };

      ws.onerror = () => setStatus('error');

      // Only reached for closes not initiated by `releaseCapture`.
      ws.onclose = () => {
        if (wsRef.current !== ws) return;
        releaseCapture();
        setIsStreaming(false);
        // Keep 'error' visible instead of masking it with 'disconnected'.
        setStatus((prev) => (prev === 'error' ? prev : 'disconnected'));
      };

      processor.onaudioprocess = (e) => {
        if (!isStreamingRef.current) return;
        if (ws.readyState !== WebSocket.OPEN) return;
        const float32Data = e.inputBuffer.getChannelData(0);
        ws.send(float32Data.buffer);
      };

      source.connect(processor);
      // Connected so the processor is part of the rendered graph; its output
      // is never written, so this branch contributes silence.
      processor.connect(context.destination);
    } catch (err) {
      console.error('Video audio capture failed:', err);
      releaseCapture();
      setStatus('error');
    }
  }, [videoElement, getWSURL, releaseCapture]);

  const stopStreaming = useCallback(() => {
    releaseCapture();
    setIsStreaming(false);
    setStatus('idle');
    setPartialTranscript('');
  }, [releaseCapture]);

  // Stop when the captured element changes and when the component unmounts.
  useEffect(() => {
    return () => {
      stopStreaming();
    };
  }, [videoElement, stopStreaming]);

  // Follow the element's playback state.
  useEffect(() => {
    if (!videoElement) return;
    const onPlay = () => startStreaming();
    const onPause = () => stopStreaming();
    const onEnded = () => stopStreaming();
    videoElement.addEventListener('play', onPlay);
    videoElement.addEventListener('pause', onPause);
    videoElement.addEventListener('ended', onEnded);
    return () => {
      videoElement.removeEventListener('play', onPlay);
      videoElement.removeEventListener('pause', onPause);
      videoElement.removeEventListener('ended', onEnded);
    };
  }, [videoElement, startStreaming, stopStreaming]);

  return {
    transcript,
    partialTranscript,
    status,
    isStreaming,
    startStreaming,
    stopStreaming,
  };
}
// ─── Hook: useFullTranscript (Batch Mode) ───────────────────────────────────
/** Options accepted by `useFullTranscript`. */
interface UseFullTranscriptOptions {
  /** Identifier placed in the request path (`/api/v1/video/{videoId}/transcribe`). */
  videoId: string;
}
/**
 * Batch-mode transcription: POSTs to `/api/v1/video/{videoId}/transcribe`
 * and exposes the returned text together with loading and error state.
 *
 * @param options.videoId Identifier placed (URL-encoded) in the request path.
 * @returns `fullTranscript`, `isLoading`, `error`, and `requestFullTranscript`,
 *          which resolves to the transcript text or `null` on failure.
 */
export function useFullTranscript({ videoId }: UseFullTranscriptOptions) {
  const [fullTranscript, setFullTranscript] = useState('');
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);
  // Sequence number of the most recent request; older responses must not
  // overwrite state that belongs to a newer request (e.g. after videoId changed).
  const latestRequestRef = useRef(0);

  const requestFullTranscript = useCallback(async (): Promise<string | null> => {
    const requestId = ++latestRequestRef.current;
    const isLatest = () => latestRequestRef.current === requestId;

    setIsLoading(true);
    setError(null);
    try {
      const resp = await fetch(
        `/api/v1/video/${encodeURIComponent(videoId)}/transcribe`,
        { method: 'POST' },
      );
      if (!resp.ok) {
        throw new Error(`Server returned ${resp.status}`);
      }
      const data: unknown = await resp.json();
      const text = (data as { text?: unknown } | null)?.text;
      // Reject a body without a string `text` rather than storing a
      // non-string value in string-typed state.
      if (typeof text !== 'string') {
        throw new Error('Transcription response did not include text');
      }
      if (isLatest()) setFullTranscript(text);
      return text;
    } catch (err) {
      const msg = err instanceof Error ? err.message : 'Transcription failed';
      if (isLatest()) setError(msg);
      return null;
    } finally {
      if (isLatest()) setIsLoading(false);
    }
  }, [videoId]);

  return { fullTranscript, isLoading, error, requestFullTranscript };
}
// ─── Usage Example (in LTTPage.tsx) ─────────────────────────────────────────
//
// const videoRef = useRef<HTMLVideoElement>(null);
// const [currentVideoId, setCurrentVideoId] = useState<string | null>(null);
//
// // Streaming ASR (auto-starts on video play, stops on pause/end)
// const asr = useVideoASR({
// videoId: currentVideoId ?? '',
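// // NOTE: videoRef.current is null on the first render, and assigning a ref
// // does not re-render; the hook only sees the element after a later render.
// videoElement: videoRef.current,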
// language: 'yue',
// onFinalTranscript: (text) => {
// setQueryText(text); // into QueryInput
// },
// });
//
// // Full Transcript (manual button)
// const ft = useFullTranscript({ videoId: currentVideoId ?? '' });
//
// return (
// <>
// <video ref={videoRef} src={videoUrl} controls />
// <button onClick={ft.requestFullTranscript} disabled={ft.isLoading}>
// {ft.isLoading ? 'Transcribing...' : 'Full Transcript'}
// </button>
// <QueryInput
// value={queryText}
// onChange={setQueryText}
// onSubmit={handleQuerySubmit}
// partialText={asr.partialTranscript}
// />
// </>
// );