// 179 lines, 5.5 KiB, TypeScript
import { useState, useRef, useCallback, useEffect } from 'react'
|
|
import type { ASRMessage, ASRStatus } from '../types'
|
|
|
|
/** Options accepted by the `useYouTubeASR` hook. */
interface UseYouTubeASROptions {
  /** Video identifier; becomes the last path segment of the ASR WebSocket URL. */
  videoId: string
  /** Element whose `play` / `pause` / `ended` events start and stop streaming. */
  videoElement: HTMLVideoElement | null
  /** Element whose audio output is captured and sent to the ASR backend. */
  audioElement: HTMLAudioElement | null
  /**
   * Language code sent as the `language` query parameter. The hook defaults
   * it to `'yue'`; the value `'auto'` omits the parameter entirely.
   */
  language?: string
  /**
   * Called with the full transcript text when the backend reports a final
   * result, and again with the accumulated text when streaming is stopped.
   */
  onFinalTranscript?: (text: string) => void
}
|
|
|
|
/**
 * React hook that captures audio from a media element, streams it to an ASR
 * backend over a WebSocket, and exposes the transcript received in return.
 *
 * Streaming starts when `videoElement` fires `play` and stops on `pause` or
 * `ended`; `startStreaming` / `stopStreaming` are also returned for manual
 * control.
 *
 * @param options - See `UseYouTubeASROptions`. `language` defaults to `'yue'`.
 * @returns `transcript` (accumulated text), `partialTranscript` (text shown
 *   while a result is not yet final), `isStreaming`, `status`, and the
 *   `startStreaming` / `stopStreaming` controls.
 */
export function useYouTubeASR({
  videoId,
  videoElement,
  audioElement,
  language = 'yue',
  onFinalTranscript,
}: UseYouTubeASROptions) {
  const [transcript, setTranscript] = useState('')
  const [partialTranscript, setPartialTranscript] = useState('')
  const [status, setStatus] = useState<ASRStatus>('idle')
  const [isStreaming, setIsStreaming] = useState(false)

  const wsRef = useRef<WebSocket | null>(null)
  const audioContextRef = useRef<AudioContext | null>(null)
  const processorRef = useRef<ScriptProcessorNode | null>(null)
  const sourceRef = useRef<MediaElementAudioSourceNode | null>(null)
  // Mirrors `isStreaming` so the audio callback can read it without re-rendering.
  const isStreamingRef = useRef(false)
  // Ensures the audio graph is built only once.
  const graphSetupRef = useRef(false)
  // Mirrors `transcript` so socket handlers can append without stale closures.
  const transcriptRef = useRef('')
  // Last `stash` value received with a delta message; flushed on stop.
  const lastStashRef = useRef('')
  // Always points at the latest callback so long-lived handlers never call a stale one.
  const onFinalTranscriptRef = useRef(onFinalTranscript)
  onFinalTranscriptRef.current = onFinalTranscript

  /** Builds the ASR WebSocket URL for the current video and language. */
  const getWSURL = useCallback(() => {
    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
    const backendHost = import.meta.env.VITE_WS_HOST ?? window.location.host
    // Encode caller-supplied values so they cannot break the URL structure.
    const langParam =
      language !== 'auto' ? `?language=${encodeURIComponent(language)}` : ''
    return `${protocol}//${backendHost}/ws/asr/${encodeURIComponent(videoId)}${langParam}`
  }, [videoId, language])

  /** Opens a new socket, stores it in `wsRef`, and wires its event handlers. */
  const connectWebSocket = useCallback(() => {
    const ws = new WebSocket(getWSURL())
    wsRef.current = ws

    // Socket events fire asynchronously, so a socket that was already replaced
    // or detached by closeWebSocket() can still report open/error/close.
    // Those late events must not overwrite the state of the current session.
    const isCurrent = () => wsRef.current === ws

    ws.onopen = () => {
      if (!isCurrent()) return
      isStreamingRef.current = true
      setIsStreaming(true)
      setStatus('streaming')
    }

    // Messages are intentionally processed even for a socket that is closing,
    // so text already in flight when streaming stops is not discarded.
    ws.onmessage = (e) => {
      let parsed: unknown
      try {
        parsed = JSON.parse(e.data)
      } catch (err) {
        console.error('[useYouTubeASR] ignoring malformed ASR message:', err)
        return
      }
      if (typeof parsed !== 'object' || parsed === null) return

      const msg = parsed as ASRMessage & { stash?: unknown }
      if (msg.is_final && msg.full_text) {
        transcriptRef.current = msg.full_text
        lastStashRef.current = ''
        setTranscript(msg.full_text)
        setPartialTranscript('')
        onFinalTranscriptRef.current?.(msg.full_text)
      } else if (msg.delta) {
        transcriptRef.current += msg.delta
        // Only keep a string stash; stopStreaming() calls .trim() on it.
        lastStashRef.current = typeof msg.stash === 'string' ? msg.stash : ''
        setTranscript(transcriptRef.current)
        setPartialTranscript(transcriptRef.current)
      }
    }

    ws.onerror = (e) => {
      if (!isCurrent()) return
      console.error('[useYouTubeASR] WebSocket error:', e)
      setStatus('error')
    }

    ws.onclose = () => {
      if (!isCurrent()) return
      wsRef.current = null
      isStreamingRef.current = false
      setIsStreaming(false)
      setStatus('disconnected')
    }
  }, [getWSURL])

  /** Closes the current socket (if any) and detaches it from `wsRef`. */
  const closeWebSocket = useCallback(() => {
    wsRef.current?.close()
    wsRef.current = null
  }, [])

  /** Resumes the audio context and (re)connects the ASR socket. No-op without an audio element. */
  const startStreaming = useCallback(() => {
    if (!audioElement) return
    try {
      setStatus('connecting')
      // resume() returns a promise; report a rejection instead of leaving it unhandled.
      void audioContextRef.current?.resume().catch((err: unknown) => {
        console.error('[useYouTubeASR] AudioContext resume failed:', err)
      })
      // The previous socket's close event is ignored once detached, so clear
      // the streaming flags here until the new socket reports open.
      isStreamingRef.current = false
      setIsStreaming(false)
      closeWebSocket()
      connectWebSocket()
    } catch (err) {
      console.error('[useYouTubeASR] startStreaming failed:', err)
      setStatus('error')
    }
  }, [audioElement, closeWebSocket, connectWebSocket])

  /**
   * Stops streaming, closes the socket, and reports the accumulated text
   * (including any pending stash) through `onFinalTranscript`.
   */
  const stopStreaming = useCallback(() => {
    isStreamingRef.current = false
    setIsStreaming(false)
    closeWebSocket()
    setStatus('idle')

    let currentText = transcriptRef.current.trim()
    const stash = lastStashRef.current.trim()
    if (stash && !currentText.endsWith(stash)) {
      currentText += stash
      transcriptRef.current = currentText
      // Keep the rendered transcript in sync with the ref and the callback value.
      setTranscript(currentText)
    }
    lastStashRef.current = ''

    if (currentText) {
      onFinalTranscriptRef.current?.(currentText)
      setPartialTranscript('')
    }
  }, [closeWebSocket])

  // Build the audio graph once: media element -> script processor -> destination.
  useEffect(() => {
    if (!audioElement || graphSetupRef.current) return
    try {
      const audioContext = new AudioContext({ sampleRate: 16000 })
      audioContextRef.current = audioContext

      const source = audioContext.createMediaElementSource(audioElement)
      sourceRef.current = source

      // 4096-frame buffer, one input channel, one output channel.
      const processor = audioContext.createScriptProcessor(4096, 1, 1)
      processorRef.current = processor

      processor.onaudioprocess = (e) => {
        const float32Data = e.inputBuffer.getChannelData(0)
        // Copy input to output so the audio still reaches the destination.
        e.outputBuffer.getChannelData(0).set(float32Data)

        if (!isStreamingRef.current) return
        const ws = wsRef.current
        if (!ws || ws.readyState !== WebSocket.OPEN) return
        // Send the raw Float32 samples of channel 0 as a binary frame.
        ws.send(float32Data.buffer)
      }

      source.connect(processor)
      processor.connect(audioContext.destination)
      graphSetupRef.current = true
    } catch (err) {
      console.error('[useYouTubeASR] audio graph setup failed:', err)
    }
  }, [audioElement])

  // Tear everything down on unmount.
  useEffect(() => {
    return () => {
      isStreamingRef.current = false
      processorRef.current?.disconnect()
      sourceRef.current?.disconnect()
      // Detach before the close event fires so its handler does not set state after unmount.
      wsRef.current?.close()
      wsRef.current = null
      void audioContextRef.current?.close().catch((err: unknown) => {
        console.warn('[useYouTubeASR] AudioContext close failed:', err)
      })
    }
  }, [])

  // Drive streaming from the video element's playback events.
  useEffect(() => {
    if (!videoElement) return
    const onPlay = () => startStreaming()
    const onPause = () => stopStreaming()
    const onEnded = () => stopStreaming()
    videoElement.addEventListener('play', onPlay)
    videoElement.addEventListener('pause', onPause)
    videoElement.addEventListener('ended', onEnded)
    return () => {
      videoElement.removeEventListener('play', onPlay)
      videoElement.removeEventListener('pause', onPause)
      videoElement.removeEventListener('ended', onEnded)
    }
  }, [videoElement, startStreaming, stopStreaming])

  return {
    transcript,
    partialTranscript,
    isStreaming,
    status,
    startStreaming,
    stopStreaming,
  }
}
|