// legco_ai_assistant/frontend/src/hooks/useVideoASR.ts
// (177 lines, 5.4 KiB, TypeScript)

import { useState, useRef, useCallback, useEffect } from 'react'
import type { ASRMessage, ASRStatus } from '../types'
/** Options accepted by the `useVideoASR` hook. */
interface UseVideoASROptions {
  /** Identifier of the video; used as the last path segment of the ASR WebSocket URL. */
  videoId: string
  /** The video element whose audio is captured, or `null` while it is not available. */
  videoElement: HTMLVideoElement | null
  /** Recognition language sent as the `language` query parameter; `'auto'` omits it. */
  language?: string
  /** Invoked with the full transcript text whenever a final result is produced. */
  onFinalTranscript?: (text: string) => void
}
/**
 * React hook that taps the audio of a `<video>` element through the Web Audio
 * API and streams it over a WebSocket to `/ws/asr/<videoId>`, exposing the
 * transcript text received back.
 *
 * Streaming starts automatically on the element's `play` event and stops on
 * `pause` / `ended`; `startStreaming` and `stopStreaming` are also returned
 * for manual control.
 *
 * @param options.videoId          Video identifier used in the WebSocket path.
 * @param options.videoElement     Element to capture audio from (`null` = not ready).
 * @param options.language         Value of the `language` query parameter
 *                                 (default `'yue'`); `'auto'` omits the parameter.
 * @param options.onFinalTranscript Called with the full text when the server
 *                                 sends a final result and when streaming is
 *                                 stopped with non-empty text.
 * @returns `transcript`, `partialTranscript`, `isStreaming`, `status`,
 *          `startStreaming` and `stopStreaming`.
 */
export function useVideoASR({
  videoId,
  videoElement,
  language = 'yue',
  onFinalTranscript,
}: UseVideoASROptions) {
  const [transcript, setTranscript] = useState('')
  const [partialTranscript, setPartialTranscript] = useState('')
  const [status, setStatus] = useState<ASRStatus>('idle')
  const [isStreaming, setIsStreaming] = useState(false)

  const wsRef = useRef<WebSocket | null>(null)
  const audioContextRef = useRef<AudioContext | null>(null)
  const processorRef = useRef<ScriptProcessorNode | null>(null)
  const sourceRef = useRef<MediaElementAudioSourceNode | null>(null)
  // Mirrors `isStreaming` so the audio callback can read it without re-rendering.
  const isStreamingRef = useRef(false)
  // Set once the audio graph has been built; the graph is never rebuilt.
  const graphSetupRef = useRef(false)
  // Accumulated transcript text (final text or concatenated deltas).
  const transcriptRef = useRef('')
  // Last `stash` value received alongside a delta message.
  const lastStashRef = useRef('')
  // Always points at the latest callback so socket handlers never call a stale one.
  const onFinalTranscriptRef = useRef(onFinalTranscript)
  onFinalTranscriptRef.current = onFinalTranscript

  /** Builds the ASR WebSocket URL for the current video and language. */
  const getWSURL = useCallback(() => {
    const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
    const backendHost = import.meta.env.VITE_WS_HOST ?? window.location.host
    // Encode user-controlled parts so special characters cannot alter the URL.
    const langParam =
      language !== 'auto' ? `?language=${encodeURIComponent(language)}` : ''
    return `${protocol}//${backendHost}/ws/asr/${encodeURIComponent(videoId)}${langParam}`
  }, [videoId, language])

  /** Opens a new socket, stores it in `wsRef` and wires up its handlers. */
  const connectWebSocket = useCallback(() => {
    const ws = new WebSocket(getWSURL())
    wsRef.current = ws

    // Socket events are delivered asynchronously, so a socket that has been
    // replaced or detached can still fire `close`/`error`/`message`. Those
    // late events must not touch state that now belongs to a newer socket
    // (or to the 'idle' state set by stopStreaming).
    const isCurrent = () => wsRef.current === ws

    ws.onopen = () => {
      if (!isCurrent()) return
      isStreamingRef.current = true
      setIsStreaming(true)
      setStatus('streaming')
    }

    ws.onmessage = (e) => {
      if (!isCurrent()) return
      if (typeof e.data !== 'string') return

      let parsed: unknown
      try {
        parsed = JSON.parse(e.data)
      } catch (err) {
        // One malformed frame should not break the handler for later frames.
        console.error('[useVideoASR] ignoring malformed ASR message:', err)
        return
      }
      if (parsed === null || typeof parsed !== 'object') return
      const msg = parsed as ASRMessage & { stash?: unknown }

      if (msg.is_final && msg.full_text) {
        transcriptRef.current = msg.full_text
        lastStashRef.current = ''
        setTranscript(msg.full_text)
        setPartialTranscript('')
        onFinalTranscriptRef.current?.(msg.full_text)
      } else if (msg.delta) {
        transcriptRef.current += msg.delta
        // Only keep string stashes; stopStreaming calls `.trim()` on this value.
        lastStashRef.current = typeof msg.stash === 'string' ? msg.stash : ''
        setTranscript(transcriptRef.current)
        setPartialTranscript(transcriptRef.current)
      }
    }

    ws.onerror = (e) => {
      if (!isCurrent()) return
      console.error('[useVideoASR] WebSocket error:', e)
      setStatus('error')
    }

    ws.onclose = () => {
      if (!isCurrent()) return
      wsRef.current = null
      isStreamingRef.current = false
      setIsStreaming(false)
      // A `close` event follows every `error` event; keep the error visible
      // instead of immediately replacing it with 'disconnected'.
      setStatus((prev) => (prev === 'error' ? prev : 'disconnected'))
    }
  }, [getWSURL])

  /** Detaches and closes the current socket, if any. */
  const closeWebSocket = useCallback(() => {
    const ws = wsRef.current
    // Clear the ref first so the socket's pending events are ignored.
    wsRef.current = null
    ws?.close()
  }, [])

  /** Resumes the audio context and (re)connects the ASR socket. */
  const startStreaming = useCallback(() => {
    if (!videoElement) return
    try {
      setStatus('connecting')
      // Not streaming until the new socket reports `open`.
      isStreamingRef.current = false
      setIsStreaming(false)
      // resume() returns a promise; surface a rejection instead of leaving it unhandled.
      void audioContextRef.current?.resume().catch((err: unknown) => {
        console.error('[useVideoASR] AudioContext resume failed:', err)
      })
      closeWebSocket()
      connectWebSocket()
    } catch (err) {
      console.error('[useVideoASR] startStreaming failed:', err)
      setStatus('error')
    }
  }, [videoElement, closeWebSocket, connectWebSocket])

  /**
   * Closes the socket, returns to 'idle' and flushes the accumulated text
   * (plus any pending stash) through `onFinalTranscript`.
   */
  const stopStreaming = useCallback(() => {
    isStreamingRef.current = false
    setIsStreaming(false)
    closeWebSocket()
    setStatus('idle')

    let currentText = transcriptRef.current.trim()
    const stash = lastStashRef.current.trim()
    if (stash && !currentText.endsWith(stash)) {
      currentText += stash
      transcriptRef.current = currentText
      // Keep the rendered transcript in sync with the ref and the callback text.
      setTranscript(currentText)
    }
    lastStashRef.current = ''

    if (currentText) {
      onFinalTranscriptRef.current?.(currentText)
      setPartialTranscript('')
    }
  }, [closeWebSocket])

  // Build the audio graph once: video element -> script processor -> speakers.
  // NOTE(review): because of `graphSetupRef`, a later change to a different
  // `videoElement` does not rebuild the graph — confirm this is intended.
  useEffect(() => {
    if (!videoElement || graphSetupRef.current) return
    try {
      const audioContext = new AudioContext({ sampleRate: 16000 })
      audioContextRef.current = audioContext

      const source = audioContext.createMediaElementSource(videoElement)
      sourceRef.current = source

      const processor = audioContext.createScriptProcessor(4096, 1, 1)
      processorRef.current = processor

      processor.onaudioprocess = (e) => {
        const float32Data = e.inputBuffer.getChannelData(0)
        const outputData = e.outputBuffer.getChannelData(0)
        // Pass the audio through so the video stays audible.
        outputData.set(float32Data)

        if (!isStreamingRef.current) return
        if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return
        wsRef.current.send(float32Data.buffer)
      }

      source.connect(processor)
      processor.connect(audioContext.destination)
      graphSetupRef.current = true
    } catch (err) {
      console.error('[useVideoASR] audio graph setup failed:', err)
    }
  }, [videoElement])

  // Tear everything down on unmount.
  useEffect(() => {
    return () => {
      isStreamingRef.current = false
      processorRef.current?.disconnect()
      sourceRef.current?.disconnect()

      // Detach before closing so the late `close` event does not set state
      // on an unmounted component.
      const ws = wsRef.current
      wsRef.current = null
      ws?.close()

      void audioContextRef.current?.close().catch((err: unknown) => {
        console.error('[useVideoASR] AudioContext close failed:', err)
      })
    }
  }, [])

  // Drive streaming from the video's playback state.
  useEffect(() => {
    if (!videoElement) return
    const onPlay = () => startStreaming()
    const onPause = () => stopStreaming()
    const onEnded = () => stopStreaming()

    videoElement.addEventListener('play', onPlay)
    videoElement.addEventListener('pause', onPause)
    videoElement.addEventListener('ended', onEnded)
    return () => {
      videoElement.removeEventListener('play', onPlay)
      videoElement.removeEventListener('pause', onPause)
      videoElement.removeEventListener('ended', onEnded)
    }
  }, [videoElement, startStreaming, stopStreaming])

  return {
    transcript,
    partialTranscript,
    isStreaming,
    status,
    startStreaming,
    stopStreaming,
  }
}