import { useState, useRef, useCallback, useEffect } from 'react'
import type { ASRMessage, ASRStatus } from '../types'

/** Options accepted by {@link useVideoASR}. */
interface UseVideoASROptions {
  /** Identifier placed in the WebSocket path (`/ws/asr/<videoId>`). */
  videoId: string
  /** Video element whose audio is captured; the hook is inert while this is `null`. */
  videoElement: HTMLVideoElement | null
  /** Sent as the `language` query parameter; the value `'auto'` omits the parameter. */
  language?: string
  /** Invoked with the full text on a final server message and when streaming is stopped. */
  onFinalTranscript?: (text: string) => void
}

/** Values returned by {@link useVideoASR}. */
interface UseVideoASRResult {
  transcript: string
  partialTranscript: string
  isStreaming: boolean
  status: ASRStatus
  startStreaming: () => void
  stopStreaming: () => void
}

/** Server message plus the optional `stash` field this hook reads from delta messages. */
type ASRMessageWithStash = ASRMessage & { stash?: unknown }

/**
 * Parses a raw WebSocket payload into an ASR message.
 *
 * Returns `null` (after logging) for non-string frames, invalid JSON, or JSON
 * that is not an object, so a bad frame cannot throw inside the socket handler.
 */
function parseASRMessage(data: unknown): ASRMessageWithStash | null {
  if (typeof data !== 'string') return null
  try {
    const parsed: unknown = JSON.parse(data)
    if (typeof parsed !== 'object' || parsed === null) return null
    return parsed as ASRMessageWithStash
  } catch (err: unknown) {
    console.error('[useVideoASR] ignoring malformed message:', err)
    return null
  }
}

/**
 * Streams the audio of a video element to an ASR backend over WebSocket and
 * exposes the resulting transcript.
 *
 * Streaming starts on the element's `play` event and stops on `pause`/`ended`.
 * Audio is read through a `ScriptProcessorNode` and each buffer of channel 0 is
 * sent as raw Float32 samples while the socket is open.
 *
 * @returns transcript state, connection status and manual start/stop controls.
 */
export function useVideoASR({
  videoId,
  videoElement,
  language = 'yue',
  onFinalTranscript,
}: UseVideoASROptions): UseVideoASRResult {
  const [transcript, setTranscript] = useState('')
  const [partialTranscript, setPartialTranscript] = useState('')
  const [status, setStatus] = useState<ASRStatus>('idle')
  const [isStreaming, setIsStreaming] = useState(false)

  const wsRef = useRef<WebSocket | null>(null)
  const audioContextRef = useRef<AudioContext | null>(null)
  const processorRef = useRef<ScriptProcessorNode | null>(null)
  const sourceRef = useRef<MediaElementAudioSourceNode | null>(null)
  // Mirrors `isStreaming` so the audio callback can read it without re-subscribing.
  const isStreamingRef = useRef(false)
  const graphSetupRef = useRef(false)
  const transcriptRef = useRef('')
  const lastStashRef = useRef('')

  // Always call the latest callback without making the socket callbacks depend on it.
  const onFinalTranscriptRef = useRef(onFinalTranscript)
  onFinalTranscriptRef.current = onFinalTranscript

  /** Builds the backend URL; `VITE_WS_HOST` overrides the page host when defined. */
  const getWSURL = useCallback(() => {
    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
    const backendHost = import.meta.env.VITE_WS_HOST ?? window.location.host
    const langParam =
      language !== 'auto' ? `?language=${encodeURIComponent(language)}` : ''
    return `${protocol}//${backendHost}/ws/asr/${encodeURIComponent(videoId)}${langParam}`
  }, [videoId, language])

  /** Opens a new socket and wires its handlers to the hook state. */
  const connectWebSocket = useCallback(() => {
    const ws = new WebSocket(getWSURL())
    wsRef.current = ws

    // Events of a socket we already replaced or closed ourselves arrive
    // asynchronously; they must not overwrite the state of the current session.
    const isCurrent = () => wsRef.current === ws

    ws.onopen = () => {
      if (!isCurrent()) return
      isStreamingRef.current = true
      setIsStreaming(true)
      setStatus('streaming')
    }

    ws.onmessage = (e) => {
      if (!isCurrent()) return
      const msg = parseASRMessage(e.data)
      if (!msg) return

      if (msg.is_final && msg.full_text) {
        // A final message replaces everything accumulated so far.
        transcriptRef.current = msg.full_text
        lastStashRef.current = ''
        setTranscript(msg.full_text)
        setPartialTranscript('')
        onFinalTranscriptRef.current?.(msg.full_text)
      } else if (msg.delta) {
        transcriptRef.current += msg.delta
        lastStashRef.current = typeof msg.stash === 'string' ? msg.stash : ''
        setTranscript(transcriptRef.current)
        setPartialTranscript(transcriptRef.current)
      }
    }

    ws.onerror = (e) => {
      if (!isCurrent()) return
      console.error('[useVideoASR] WebSocket error:', e)
      setStatus('error')
    }

    ws.onclose = () => {
      if (!isCurrent()) return
      isStreamingRef.current = false
      setIsStreaming(false)
      // `close` follows `error`; keep the error visible instead of masking it.
      setStatus((prev) => (prev === 'error' ? prev : 'disconnected'))
    }
  }, [getWSURL])

  /** Closes the current socket, detaching it first so its late events are ignored. */
  const closeWebSocket = useCallback(() => {
    const ws = wsRef.current
    wsRef.current = null
    ws?.close()
  }, [])

  /** Resumes the audio context and (re)connects the socket. No-op without a video element. */
  const startStreaming = useCallback(() => {
    if (!videoElement) return
    try {
      setStatus('connecting')
      void audioContextRef.current?.resume().catch((err: unknown) => {
        console.error('[useVideoASR] AudioContext resume failed:', err)
      })
      closeWebSocket()
      connectWebSocket()
    } catch (err) {
      console.error('[useVideoASR] startStreaming failed:', err)
      setStatus('error')
    }
  }, [videoElement, closeWebSocket, connectWebSocket])

  /**
   * Closes the socket, resets status to `'idle'` and flushes the accumulated
   * text (plus any pending stash) through `onFinalTranscript`.
   */
  const stopStreaming = useCallback(() => {
    // `pause` is followed by `ended` at the end of playback; only the first
    // call of a session should flush, otherwise the callback fires twice.
    const hadSession = isStreamingRef.current || wsRef.current !== null

    isStreamingRef.current = false
    setIsStreaming(false)
    closeWebSocket()
    setStatus('idle')

    if (!hadSession) return

    let finalText = transcriptRef.current.trim()
    const stash = lastStashRef.current.trim()
    lastStashRef.current = ''

    if (stash && !finalText.endsWith(stash)) {
      finalText += stash
      transcriptRef.current = finalText
      // Keep the rendered transcript in step with the ref.
      setTranscript(finalText)
    }

    if (finalText) {
      onFinalTranscriptRef.current?.(finalText)
      setPartialTranscript('')
    }
  }, [closeWebSocket])

  // Build the audio graph once: element source -> script processor -> destination.
  // NOTE(review): the graph is never rebuilt if `videoElement` later changes to a
  // different element, because `graphSetupRef` stays true — confirm this is intended.
  useEffect(() => {
    if (!videoElement || graphSetupRef.current) return
    try {
      const audioContext = new AudioContext({ sampleRate: 16000 })
      audioContextRef.current = audioContext

      const source = audioContext.createMediaElementSource(videoElement)
      sourceRef.current = source

      // ScriptProcessorNode is deprecated in favour of AudioWorklet.
      const processor = audioContext.createScriptProcessor(4096, 1, 1)
      processorRef.current = processor

      processor.onaudioprocess = (e) => {
        const float32Data = e.inputBuffer.getChannelData(0)
        // Pass the audio through so it still reaches the destination.
        e.outputBuffer.getChannelData(0).set(float32Data)

        const ws = wsRef.current
        if (!isStreamingRef.current || !ws || ws.readyState !== WebSocket.OPEN) return
        ws.send(float32Data.buffer)
      }

      source.connect(processor)
      processor.connect(audioContext.destination)
      graphSetupRef.current = true
    } catch (err) {
      console.error('[useVideoASR] audio graph setup failed:', err)
    }
  }, [videoElement])

  // Tear everything down on unmount.
  useEffect(() => {
    return () => {
      isStreamingRef.current = false
      processorRef.current?.disconnect()
      sourceRef.current?.disconnect()

      // Detach before closing so the socket's `close` event cannot set state
      // or invoke callbacks after unmount.
      const ws = wsRef.current
      wsRef.current = null
      ws?.close()

      void audioContextRef.current?.close().catch((err: unknown) => {
        console.error('[useVideoASR] AudioContext close failed:', err)
      })
    }
  }, [])

  // Drive streaming from the element's playback events.
  useEffect(() => {
    if (!videoElement) return

    const onPlay = () => startStreaming()
    const onPause = () => stopStreaming()
    const onEnded = () => stopStreaming()

    videoElement.addEventListener('play', onPlay)
    videoElement.addEventListener('pause', onPause)
    videoElement.addEventListener('ended', onEnded)

    return () => {
      videoElement.removeEventListener('play', onPlay)
      videoElement.removeEventListener('pause', onPause)
      videoElement.removeEventListener('ended', onEnded)
    }
  }, [videoElement, startStreaming, stopStreaming])

  return {
    transcript,
    partialTranscript,
    isStreaming,
    status,
    startStreaming,
    stopStreaming,
  }
}