245 lines
7.9 KiB
TypeScript
245 lines
7.9 KiB
TypeScript
/**
 * Reference: React/TypeScript adaptation of video-audio capture for Alibaba Cloud ASR.
 *
 * Two modes:
 *   A. Streaming (real-time): capture the <video> element's audio during playback
 *      → WebSocket → backend → DashScope realtime → transcript in QueryInput
 *   B. Full Transcript (batch): POST /api/v1/video/{id}/transcribe
 *      → backend extracts audio via ffmpeg → DashScope non-streaming → full transcript
 *
 * Audio pipeline (streaming mode):
 *   <video> element → AudioContext.createMediaElementSource(video)
 *     → ScriptProcessor(4096, 1, 1) → Float32Array
 *     → WebSocket.send(float32Data.buffer) → Backend → DashScope
 *
 * IMPORTANT:
 *   - processor.connect(audioContext.destination) keeps the processor in the
 *     rendered graph. A ScriptProcessor outputs silence unless its handler fills
 *     outputBuffer, so for the video to stay audible the source must also reach
 *     audioContext.destination (e.g. source.connect(audioContext.destination)).
 *   - No getUserMedia() needed (no microphone permission)
 *   - Full Transcript mode uses backend ffmpeg to extract audio server-side
 */
|
|
|
|
import { useState, useRef, useCallback, useEffect } from 'react';
|
|
|
|
// ─── Types ──────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * JSON payload received on the ASR WebSocket for each recognition update.
 */
interface ASRMessage {
  /** NOTE(review): presumably the text added since the previous message — confirm against the backend. */
  delta: string;
  /** Transcript text carried by this message; shown as the current transcript. */
  full_text: string;
  /** NOTE(review): presumably the language code reported by the recognizer — confirm against the backend. */
  language: string;
  /** True when this message is a final result rather than an interim hypothesis. */
  is_final: boolean;
}
|
|
|
|
/**
 * Connection state exposed by the streaming hook.
 *
 * - `idle`: nothing running (initial state, and after an explicit stop)
 * - `connecting`: capture is being set up and the socket is opening
 * - `streaming`: the socket is open
 * - `disconnected`: the socket closed
 * - `error`: the socket reported an error or audio capture failed
 */
type ASRStatus =
  | 'idle'
  | 'connecting'
  | 'streaming'
  | 'disconnected'
  | 'error';
|
|
|
|
/**
 * Options accepted by `useVideoASR`.
 */
interface UseVideoASROptions {
  /** Identifier of the video; becomes the last path segment of the ASR WebSocket URL. */
  videoId: string;
  /** The `<video>` element whose audio is captured; `null` until the element exists. */
  videoElement: HTMLVideoElement | null;
  /**
   * Recognition language, e.g. "yue" | "zh" | "en" | "auto".
   * Defaults to "yue"; "auto" omits the `language` query parameter.
   */
  language?: string;
  /** Called with the transcript text whenever a final, non-blank result arrives. */
  onFinalTranscript?: (text: string) => void;
}
|
|
|
|
// ─── Hook: useVideoASR (Streaming Mode) ─────────────────────────────────────
|
|
|
|
/**
 * Streams the audio of a `<video>` element to the backend ASR WebSocket while
 * the video plays, and exposes the transcript returned by the server.
 *
 * Streaming starts on the element's `play` event and stops on `pause`/`ended`;
 * `startStreaming` / `stopStreaming` are also returned for manual control.
 *
 * Audio graph:
 *   source → destination              (audible playback, kept for the element's lifetime)
 *   source → processor → destination  (silent tap, attached only while streaming)
 *
 * The AudioContext and MediaElementAudioSourceNode are created once per video
 * element and reused across play/pause cycles, because a media element can be
 * bound to a MediaElementAudioSourceNode only once. They are released when the
 * hook unmounts or a different element is supplied; an element whose graph has
 * been released cannot be captured again.
 *
 * @param options.videoId            Video identifier used in the WebSocket path.
 * @param options.videoElement       Element to capture; nothing starts while it is `null`.
 * @param options.language           Recognition language (default `'yue'`); `'auto'` sends no
 *                                   `language` query parameter.
 * @param options.onFinalTranscript  Invoked with the transcript text of each final,
 *                                   non-blank result. The latest callback is always used,
 *                                   so an inline function does not restart anything.
 * @returns `transcript`, `partialTranscript`, `status`, `isStreaming`, and the
 *          `startStreaming` / `stopStreaming` controls.
 */
export function useVideoASR({
  videoId,
  videoElement,
  language = 'yue',
  onFinalTranscript,
}: UseVideoASROptions) {
  const [transcript, setTranscript] = useState('');
  const [partialTranscript, setPartialTranscript] = useState('');
  const [status, setStatus] = useState<ASRStatus>('idle');
  const [isStreaming, setIsStreaming] = useState(false);

  const wsRef = useRef<WebSocket | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
  const processorRef = useRef<ScriptProcessorNode | null>(null);
  const sourceRef = useRef<MediaElementAudioSourceNode | null>(null);
  // Element that `sourceRef` is bound to; used to detect a swapped <video>.
  const capturedElementRef = useRef<HTMLVideoElement | null>(null);
  const isStreamingRef = useRef(false);

  // Keep the newest callback in a ref so `startStreaming` (and the play/pause
  // listeners that depend on it) stay stable when callers pass an inline function.
  const onFinalTranscriptRef = useRef(onFinalTranscript);
  useEffect(() => {
    onFinalTranscriptRef.current = onFinalTranscript;
  }, [onFinalTranscript]);

  const getWSURL = useCallback(() => {
    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
    const host = window.location.host;
    const langParam =
      language !== 'auto' ? `?language=${encodeURIComponent(language)}` : '';
    return `${protocol}//${host}/ws/asr/${encodeURIComponent(videoId)}${langParam}`;
  }, [videoId, language]);

  /**
   * Ends the current streaming session: detaches the processor tap and closes
   * the socket. The playback path (source → destination) is left intact.
   */
  const endSession = useCallback(() => {
    isStreamingRef.current = false;

    const processor = processorRef.current;
    processorRef.current = null;
    if (processor) {
      processor.onaudioprocess = null;
      // Remove only the tap edge; a bare source.disconnect() would also cut playback.
      sourceRef.current?.disconnect(processor);
      processor.disconnect();
    }

    const ws = wsRef.current;
    wsRef.current = null;
    if (ws) {
      // Detach handlers first so the asynchronous `close` event of a socket we
      // closed on purpose cannot overwrite the status set by the caller.
      ws.onopen = null;
      ws.onmessage = null;
      ws.onerror = null;
      ws.onclose = null;
      ws.close();
    }
  }, []);

  /** Releases the per-element audio graph (source node and AudioContext). */
  const releaseAudioGraph = useCallback(() => {
    sourceRef.current?.disconnect();
    sourceRef.current = null;
    capturedElementRef.current = null;

    const ctx = audioContextRef.current;
    audioContextRef.current = null;
    if (ctx && ctx.state !== 'closed') {
      void ctx.close();
    }
  }, []);

  const startStreaming = useCallback(() => {
    if (!videoElement) {
      console.error('No video element available');
      return;
    }

    // A duplicate `play` must not leak the previous socket/processor.
    endSession();
    setIsStreaming(false);

    try {
      setStatus('connecting');

      if (capturedElementRef.current !== videoElement) {
        releaseAudioGraph();
      }

      let audioContext = audioContextRef.current;
      let source = sourceRef.current;
      if (!audioContext || !source) {
        // 16 kHz context: the processor below receives 16 kHz mono float32 samples.
        audioContext = new AudioContext({ sampleRate: 16000 });
        audioContextRef.current = audioContext;

        source = audioContext.createMediaElementSource(videoElement);
        sourceRef.current = source;
        capturedElementRef.current = videoElement;

        // Once captured, the element's audio is only audible through the graph.
        source.connect(audioContext.destination);
      }

      if (audioContext.state === 'suspended') {
        audioContext.resume().catch((err: unknown) => {
          console.error('Failed to resume AudioContext:', err);
        });
      }

      const ws = new WebSocket(getWSURL());
      wsRef.current = ws;

      // Browsers fire `close` right after `error`; remember the failure so the
      // close handler does not downgrade the status to 'disconnected'.
      let failed = false;

      ws.onopen = () => {
        isStreamingRef.current = true;
        setIsStreaming(true);
        setStatus('streaming');
      };

      ws.onmessage = (e) => {
        if (typeof e.data !== 'string') return;

        let parsed: unknown;
        try {
          parsed = JSON.parse(e.data);
        } catch (err) {
          console.error('Malformed ASR message:', err);
          return;
        }
        if (typeof parsed !== 'object' || parsed === null) return;

        const { full_text: fullText, is_final: isFinal } =
          parsed as Partial<ASRMessage>;
        if (typeof fullText !== 'string') return;

        setTranscript(fullText);
        setPartialTranscript(isFinal ? '' : fullText);
        if (isFinal && fullText.trim()) {
          onFinalTranscriptRef.current?.(fullText);
        }
      };

      ws.onerror = () => {
        failed = true;
        setStatus('error');
      };

      ws.onclose = () => {
        // Server-side or network close: drop the tap so it stops firing.
        endSession();
        setIsStreaming(false);
        setStatus(failed ? 'error' : 'disconnected');
      };

      const processor = audioContext.createScriptProcessor(4096, 1, 1);
      processor.onaudioprocess = (e) => {
        if (!isStreamingRef.current) return;
        if (ws.readyState !== WebSocket.OPEN) return;
        const float32Data = e.inputBuffer.getChannelData(0);
        ws.send(float32Data.buffer);
      };

      source.connect(processor);
      // The handler never writes outputBuffer, so this edge carries silence; it
      // only keeps the processor part of the rendered graph.
      processor.connect(audioContext.destination);
      processorRef.current = processor;
    } catch (err) {
      console.error('Video audio capture failed:', err);
      endSession();
      if (!sourceRef.current) {
        // Context was created but the element could not be bound: do not keep it.
        releaseAudioGraph();
      }
      setIsStreaming(false);
      setStatus('error');
    }
  }, [videoElement, getWSURL, endSession, releaseAudioGraph]);

  const stopStreaming = useCallback(() => {
    endSession();
    setIsStreaming(false);
    setStatus('idle');
    setPartialTranscript('');
  }, [endSession]);

  // Release everything when the component using the hook unmounts.
  useEffect(() => {
    return () => {
      endSession();
      releaseAudioGraph();
    };
  }, [endSession, releaseAudioGraph]);

  // Follow the element's playback state.
  useEffect(() => {
    if (!videoElement) return;

    const onPlay = () => startStreaming();
    const onPause = () => stopStreaming();
    const onEnded = () => stopStreaming();

    videoElement.addEventListener('play', onPlay);
    videoElement.addEventListener('pause', onPause);
    videoElement.addEventListener('ended', onEnded);

    return () => {
      videoElement.removeEventListener('play', onPlay);
      videoElement.removeEventListener('pause', onPause);
      videoElement.removeEventListener('ended', onEnded);
    };
  }, [videoElement, startStreaming, stopStreaming]);

  return {
    transcript,
    partialTranscript,
    status,
    isStreaming,
    startStreaming,
    stopStreaming,
  };
}
|
|
|
|
|
|
// ─── Hook: useFullTranscript (Batch Mode) ───────────────────────────────────
|
|
|
|
/**
 * Options accepted by `useFullTranscript`.
 */
interface UseFullTranscriptOptions {
  /** Identifier of the video to transcribe; used in the `/api/v1/video/{id}/transcribe` path. */
  videoId: string;
}
|
|
|
|
/**
 * Requests a full (batch) transcript of a video from the backend.
 *
 * @param options.videoId  Identifier of the video to transcribe.
 * @returns
 *   - `fullTranscript`: text of the last successful request (initially `''`)
 *   - `isLoading`: true while a request is in flight
 *   - `error`: message of the last failure, or `null`
 *   - `requestFullTranscript`: issues `POST /api/v1/video/{id}/transcribe` and
 *     resolves to the transcript text, or `null` on failure (never rejects)
 */
export function useFullTranscript({ videoId }: UseFullTranscriptOptions) {
  const [fullTranscript, setFullTranscript] = useState('');
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);

  const requestFullTranscript = useCallback(async (): Promise<string | null> => {
    // Callers may pass '' while no video is selected; do not hit a malformed URL.
    if (!videoId) {
      setError('No video selected');
      return null;
    }

    setIsLoading(true);
    setError(null);
    try {
      const resp = await fetch(
        `/api/v1/video/${encodeURIComponent(videoId)}/transcribe`,
        { method: 'POST' },
      );
      if (!resp.ok) {
        throw new Error(`Server returned ${resp.status}`);
      }

      // Validate the payload instead of trusting its shape.
      const data: unknown = await resp.json();
      const text =
        typeof data === 'object' && data !== null
          ? (data as { text?: unknown }).text
          : undefined;
      if (typeof text !== 'string') {
        throw new Error('Transcription response did not contain text');
      }

      setFullTranscript(text);
      return text;
    } catch (err) {
      const msg = err instanceof Error ? err.message : 'Transcription failed';
      setError(msg);
      return null;
    } finally {
      setIsLoading(false);
    }
  }, [videoId]);

  return { fullTranscript, isLoading, error, requestFullTranscript };
}
|
|
|
|
|
|
// ─── Usage Example (in LTTPage.tsx) ─────────────────────────────────────────
//
// const videoRef = useRef<HTMLVideoElement>(null);
// const [currentVideoId, setCurrentVideoId] = useState<string | null>(null);
//
// // Streaming ASR (auto-starts on video play, stops on pause/end)
// const asr = useVideoASR({
//   videoId: currentVideoId ?? '',
//   videoElement: videoRef.current,
//   language: 'yue',
//   onFinalTranscript: (text) => {
//     setQueryText(text); // into QueryInput
//   },
// });
//
// // Full Transcript (manual button)
// const ft = useFullTranscript({ videoId: currentVideoId ?? '' });
//
// return (
//   <>
//     <video ref={videoRef} src={videoUrl} controls />
//     <button onClick={ft.requestFullTranscript} disabled={ft.isLoading}>
//       {ft.isLoading ? 'Transcribing...' : 'Full Transcript'}
//     </button>
//     <QueryInput
//       value={queryText}
//       onChange={setQueryText}
//       onSubmit={handleQuerySubmit}
//       partialText={asr.partialTranscript}
//     />
//   </>
// );