fix: text accumulation — stashes are sliding windows, merge via overlap detection
DashScope stashes are ~7-char rolling windows, not cumulative: each partial
event replaces the previous one, and completed events are rarely sent. This caused text to
jump/replace during streaming and disappear on pause.
Backend:
- Add _merge_stash() — finds overlapping suffix between successive stashes
and appends only new characters, reconstructing full utterance from partials
- format_transcription_event returns raw stash for read_events to merge
- read_events maintains partial_buffer via _merge_stash, clears on completed
- Guard against empty/whitespace-only stashes
Frontend:
- transcriptRef + onFinalTranscriptRef avoid stale closures in pause handler
- stopStreaming fires onFinalTranscript(currentText) before clearing partial
- Removed blind setPartialTranscript('') that erased text on pause
Tests: 16/16 ws_protocol tests pass, frontend tests unchanged
Plan: Updated phase2_implementation_plan.md to Complete with 11-bug log
This commit is contained in:
parent
fcb9ec1f6c
commit
cb0ac07786
|
|
@ -1,8 +1,8 @@
|
|||
# Phase 2: Video Upload + Video Audio ASR → RAG — Implementation Plan
|
||||
|
||||
**Created:** 2026-05-06
|
||||
**Updated:** 2026-05-06 (video audio capture via createMediaElementSource; Full Transcript batch mode)
|
||||
**Status:** Planning — Not Started
|
||||
**Updated:** 2026-05-06 (all sub-phases complete; 11 bugs resolved)
|
||||
**Status:** Complete
|
||||
**Depends on:** Phase 1 (Complete)
|
||||
|
||||
---
|
||||
|
|
@ -311,7 +311,39 @@ frontend/src/pages/LTTPage.tsx, components/QueryInput.tsx, lib/api.ts, lib/queri
|
|||
|
||||
---
|
||||
|
||||
## 9. Reference Code (`.examples/`)
|
||||
## 10. Bugs Resolved (11 Total)
|
||||
|
||||
| # | Bug | Root Cause | Fix |
|
||||
|---|-----|-----------|-----|
|
||||
| 1 | Vite proxy missing | `/api` and `/ws` routes not proxied to backend port 8000 | Added proxy config in `vite.config.ts` |
|
||||
| 2 | `crossOrigin` missing on `<video>` | Cross-origin video blocked `createMediaElementSource` | Added `crossOrigin="anonymous"` to `VideoPlayer.tsx` |
|
||||
| 3 | Audio silent (consumed but not passed through) | `onaudioprocess` didn't copy input to output | Added `outputData.set(float32Data)` |
|
||||
| 4 | Audio graph recreated every play/pause | `createMediaElementSource` can only be called once per element | Setup audio graph once per video lifetime (`graphSetupRef`) |
|
||||
| 5 | `AudioContext` suspended on play | Autoplay policy requires user gesture | Added `audioContext.resume()` on play |
|
||||
| 6 | `dashscope` package not installed | Missing from `requirements.txt` | Installed `dashscope>=0.4.0` |
|
||||
| 7 | `api_key` not passed to DashScope | `OmniRealtimeConversation` needs explicit `api_key` | Added `api_key=settings.dashscope_api_key` |
|
||||
| 8 | `data_url` malformed (missing comma) | Header was `data:audio/wav;base64` instead of `data:audio/wav;base64,` — the comma separating the header from the base64 payload was missing | Fixed data URL format in `transcribe_full()` |
|
||||
| 9 | `extra_body` sent when `language="auto"` | DashScope rejects `extra_body` with auto-detect | Omitted `extra_body` entirely when `language="auto"` |
|
||||
| 10 | Text accumulation broken — stashes are sliding windows | DashScope `stash` is a rolling ~7-char window, not cumulative. Each partial event replaces the previous. Completed events rarely sent. Old code showed partials as-is (text jumped). Simplification silently dropped partials (nothing appeared). | Added `_merge_stash()` — finds overlapping suffix between successive stashes and appends only the new chars. Partial events now contribute to a growing `partial_buffer`. `full_text = accumulated + partial_buffer`. |
|
||||
| 11 | Text disappeared on pause | `stopStreaming` set `partialTranscript = ''` and `onFinalTranscript` never fired (no completed events). | Removed blind clear; added `transcriptRef` to avoid stale closures; `stopStreaming` now fires `onFinalTranscript(currentText)` then clears partial. |
|
||||
|
||||
### Text Accumulation Architecture (Bug #10 Detail)
|
||||
|
||||
```
|
||||
Raw DashScope stashes (sliding window, ~7 chars each):
|
||||
"系多謝主席" → "主席咁咧呢個" → "呢個古洞北" → "三百二十五億"
|
||||
|
||||
_merge_stash() reconstruction:
|
||||
"" + "系多謝主席" → "系多謝主席"
|
||||
"系多謝主席" + overlap("主席")="咁咧呢個" → "系多謝主席咁咧呢個"
|
||||
"系多謝主席咁咧呢個" + overlap("呢個")="古洞北" → "系多謝主席咁咧呢個古洞北"
|
||||
"系多謝主席咁咧呢個古洞北" + no overlap → append with space → "系多謝主席咁咧呢個古洞北 三百二十五億"
|
||||
|
||||
Sent to frontend: {"full_text": accumulated + partial_buffer, "is_final": false}
|
||||
On pause: onFinalTranscript fires with current text, partialTranscript cleared
|
||||
```
|
||||
|
||||
## 11. Reference Code (`.examples/`)
|
||||
|
||||
| File | Content |
|
||||
|---|---|
|
||||
|
|
|
|||
|
|
@ -48,15 +48,25 @@ class DashScopeCallback(OmniRealtimeCallback):
|
|||
logger.info("dashscope-connection-closed code=%s msg=%s", code, msg)
|
||||
|
||||
|
||||
def _merge_stash(partial_buffer: str, new_stash: str) -> str:
|
||||
if not new_stash.strip():
|
||||
return partial_buffer
|
||||
if not partial_buffer:
|
||||
return new_stash
|
||||
for i in range(min(len(partial_buffer), len(new_stash)), 0, -1):
|
||||
if partial_buffer[-i:] == new_stash[:i]:
|
||||
return partial_buffer + new_stash[i:]
|
||||
return partial_buffer + " " + new_stash
|
||||
|
||||
|
||||
def format_transcription_event(event: dict, accumulated: str) -> dict | None:
|
||||
event_type = event.get("type", "")
|
||||
|
||||
if event_type == "conversation.item.input_audio_transcription.text":
|
||||
stash = event.get("stash", "")
|
||||
display = build_display_text(accumulated, stash) if stash else accumulated
|
||||
return {
|
||||
"delta": "",
|
||||
"full_text": _to_traditional(display),
|
||||
"stash": stash,
|
||||
"language": event.get("language", "yue"),
|
||||
"is_final": False,
|
||||
}
|
||||
|
|
@ -104,26 +114,29 @@ async def _ws_proxy_dashscope(client_ws: WebSocket, loop: asyncio.AbstractEventL
|
|||
logger.info("dashscope-session-updated lang=%s", language)
|
||||
|
||||
accumulated_text = ""
|
||||
partial_buffer = ""
|
||||
chunk_count = 0
|
||||
|
||||
async def read_events():
|
||||
nonlocal accumulated_text
|
||||
nonlocal accumulated_text, partial_buffer
|
||||
while True:
|
||||
event = await event_queue.get()
|
||||
result = format_transcription_event(event, accumulated_text)
|
||||
if result is not None:
|
||||
if result is None:
|
||||
continue
|
||||
if result["is_final"]:
|
||||
event_type = event.get("type", "")
|
||||
if event_type == "conversation.item.input_audio_transcription.completed":
|
||||
transcript = event.get("transcript", "")
|
||||
if transcript and transcript.strip():
|
||||
accumulated_text = build_display_text(accumulated_text, transcript)
|
||||
logger.info(
|
||||
"dashscope-utterance-completed text_len=%d lang=%s",
|
||||
len(accumulated_text),
|
||||
result.get("language", "yue"),
|
||||
)
|
||||
partial_buffer = ""
|
||||
result["full_text"] = _to_traditional(accumulated_text)
|
||||
logger.info("dashscope-utterance-completed text_len=%d lang=%s", len(accumulated_text), result.get("language", "yue"))
|
||||
else:
|
||||
stash = result.pop("stash", "")
|
||||
if stash.strip():
|
||||
partial_buffer = _merge_stash(partial_buffer, stash)
|
||||
display = build_display_text(accumulated_text, partial_buffer)
|
||||
result["full_text"] = _to_traditional(display)
|
||||
await client_ws.send_json(result)
|
||||
|
||||
read_task = asyncio.create_task(read_events())
|
||||
|
|
@ -182,5 +195,9 @@ async def ws_asr_endpoint(websocket: WebSocket, video_id: str, language: str = "
|
|||
await _ws_proxy_dashscope(websocket, loop, language)
|
||||
except Exception as e:
|
||||
logger.error("ws-asr-error video_id=%s error=%s", video_id, e)
|
||||
try:
|
||||
await websocket.send_json({"error": "ASR service unavailable", "detail": str(e)})
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
logger.info("ws-disconnect video_id=%s", video_id)
|
||||
|
|
|
|||
|
|
@ -75,9 +75,47 @@ class TestDashScopeCallback:
|
|||
loop.close()
|
||||
|
||||
|
||||
class TestMergeStash:
    """Unit tests for ``_merge_stash`` overlap-based stash merging."""

    def test_merge_empty_buffer_returns_stash(self):
        """An empty buffer yields the stash itself."""
        from app.routers.ws_asr import _merge_stash

        merged = _merge_stash("", "你好")
        assert merged == "你好"

    def test_merge_overlapping_suffix(self):
        """A multi-character overlap is appended only once."""
        from app.routers.ws_asr import _merge_stash

        merged = _merge_stash("系多謝主席", "主席咁咧呢個")
        assert merged == "系多謝主席咁咧呢個"

    def test_merge_overlapping_single_char(self):
        """A one-character overlap is enough to join without duplication."""
        from app.routers.ws_asr import _merge_stash

        merged = _merge_stash("abcde", "efgh")
        assert merged == "abcdefgh"

    def test_merge_no_overlap_appends_with_space(self):
        """Without any overlap the stash is appended after a single space."""
        from app.routers.ws_asr import _merge_stash

        merged = _merge_stash("你好", "世界")
        assert merged == "你好 世界"

    def test_merge_stash_subset_of_buffer(self):
        """A short stash overlapping the buffer tail contributes only its new characters."""
        from app.routers.ws_asr import _merge_stash

        merged = _merge_stash("系多謝主席咁咧", "咧呢")
        assert merged == "系多謝主席咁咧呢"

    def test_merge_empty_stash_preserves_buffer(self):
        """An empty stash leaves the buffer untouched, including an empty buffer."""
        from app.routers.ws_asr import _merge_stash

        kept = _merge_stash("你好", "")
        assert kept == "你好"
        still_empty = _merge_stash("", "")
        assert still_empty == ""

    def test_merge_whitespace_only_stash_preserves_buffer(self):
        """A whitespace-only stash is ignored."""
        from app.routers.ws_asr import _merge_stash

        kept = _merge_stash("你好", " ")
        assert kept == "你好"
|
||||
|
||||
|
||||
class TestProxyFormatsTranscriptionTextEvent:
|
||||
def test_partial_event_format(self):
|
||||
"""Partial transcription event should format as ASRTranscriptEvent with is_final=False."""
|
||||
def test_partial_event_returns_stash_field(self):
|
||||
"""Partial event returns raw stash for caller to merge."""
|
||||
from app.routers.ws_asr import format_transcription_event
|
||||
|
||||
event = {
|
||||
|
|
@ -85,17 +123,16 @@ class TestProxyFormatsTranscriptionTextEvent:
|
|||
"stash": "你好",
|
||||
"language": "yue",
|
||||
}
|
||||
accumulated = ""
|
||||
|
||||
result = format_transcription_event(event, accumulated)
|
||||
result = format_transcription_event(event, "")
|
||||
assert result is not None
|
||||
assert result["is_final"] is False
|
||||
assert result["language"] == "yue"
|
||||
assert result["delta"] == ""
|
||||
assert "你好" in result["full_text"]
|
||||
assert result["stash"] == "你好"
|
||||
|
||||
def test_partial_with_accumulated(self):
|
||||
"""Partial event should combine accumulated + current stash."""
|
||||
def test_partial_event_ignores_accumulated(self):
|
||||
"""Partial event returns stash unchanged regardless of accumulated."""
|
||||
from app.routers.ws_asr import format_transcription_event
|
||||
|
||||
event = {
|
||||
|
|
@ -103,11 +140,9 @@ class TestProxyFormatsTranscriptionTextEvent:
|
|||
"stash": "世界",
|
||||
"language": "yue",
|
||||
}
|
||||
accumulated = "你好"
|
||||
|
||||
result = format_transcription_event(event, accumulated)
|
||||
assert "你好" in result["full_text"]
|
||||
assert "世界" in result["full_text"]
|
||||
result = format_transcription_event(event, "你好")
|
||||
assert result["stash"] == "世界"
|
||||
|
||||
|
||||
class TestProxyFormatsTranscriptionCompletedEvent:
|
||||
|
|
@ -120,16 +155,15 @@ class TestProxyFormatsTranscriptionCompletedEvent:
|
|||
"transcript": "你好世界",
|
||||
"language": "yue",
|
||||
}
|
||||
accumulated = ""
|
||||
|
||||
result = format_transcription_event(event, accumulated)
|
||||
result = format_transcription_event(event, "")
|
||||
assert result is not None
|
||||
assert result["is_final"] is True
|
||||
assert result["language"] == "yue"
|
||||
assert "你好" in result["full_text"]
|
||||
|
||||
def test_completed_updates_accumulated(self):
|
||||
"""Completed event should return updated accumulated text."""
|
||||
"""Completed event appends transcript to accumulated text."""
|
||||
from app.routers.ws_asr import format_transcription_event
|
||||
|
||||
event = {
|
||||
|
|
@ -137,9 +171,8 @@ class TestProxyFormatsTranscriptionCompletedEvent:
|
|||
"transcript": "世界",
|
||||
"language": "yue",
|
||||
}
|
||||
accumulated = "你好"
|
||||
|
||||
result = format_transcription_event(event, accumulated)
|
||||
result = format_transcription_event(event, "你好")
|
||||
assert "你好" in result["full_text"]
|
||||
assert "世界" in result["full_text"]
|
||||
|
||||
|
|
@ -147,6 +180,5 @@ class TestProxyFormatsTranscriptionCompletedEvent:
|
|||
"""Unknown event types should return None."""
|
||||
from app.routers.ws_asr import format_transcription_event
|
||||
|
||||
event = {"type": "unknown.event"}
|
||||
result = format_transcription_event(event, "")
|
||||
result = format_transcription_event({"type": "unknown.event"}, "")
|
||||
assert result is None
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ export const VideoPlayer = forwardRef<HTMLVideoElement, VideoPlayerProps>(({ src
|
|||
data-testid="video-player"
|
||||
src={src}
|
||||
controls
|
||||
crossOrigin="anonymous"
|
||||
className="w-full max-h-60 rounded-lg bg-black"
|
||||
onLoadStart={handleLoadStart}
|
||||
onCanPlay={handleCanPlay}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,10 @@ export function useVideoASR({
|
|||
const processorRef = useRef<ScriptProcessorNode | null>(null)
|
||||
const sourceRef = useRef<MediaElementAudioSourceNode | null>(null)
|
||||
const isStreamingRef = useRef(false)
|
||||
const graphSetupRef = useRef(false)
|
||||
const transcriptRef = useRef('')
|
||||
const onFinalTranscriptRef = useRef(onFinalTranscript)
|
||||
onFinalTranscriptRef.current = onFinalTranscript
|
||||
|
||||
const getWSURL = useCallback(() => {
|
||||
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'
|
||||
|
|
@ -33,20 +37,7 @@ export function useVideoASR({
|
|||
return `${protocol}//${backendHost}/ws/asr/${videoId}${langParam}`
|
||||
}, [videoId, language])
|
||||
|
||||
const startStreaming = useCallback(() => {
|
||||
if (!videoElement) return
|
||||
try {
|
||||
setStatus('connecting')
|
||||
|
||||
const audioContext = new AudioContext({ sampleRate: 16000 })
|
||||
audioContextRef.current = audioContext
|
||||
|
||||
const source = audioContext.createMediaElementSource(videoElement)
|
||||
sourceRef.current = source
|
||||
|
||||
const processor = audioContext.createScriptProcessor(4096, 1, 1)
|
||||
processorRef.current = processor
|
||||
|
||||
const connectWebSocket = useCallback(() => {
|
||||
const ws = new WebSocket(getWSURL())
|
||||
wsRef.current = ws
|
||||
|
||||
|
|
@ -58,48 +49,83 @@ export function useVideoASR({
|
|||
|
||||
ws.onmessage = (e) => {
|
||||
const msg: ASRMessage = JSON.parse(e.data)
|
||||
transcriptRef.current = msg.full_text
|
||||
setTranscript(msg.full_text)
|
||||
setPartialTranscript(msg.is_final ? '' : msg.full_text)
|
||||
if (msg.is_final && msg.full_text.trim()) {
|
||||
onFinalTranscript?.(msg.full_text)
|
||||
onFinalTranscriptRef.current?.(msg.full_text)
|
||||
}
|
||||
}
|
||||
|
||||
ws.onerror = () => setStatus('error')
|
||||
ws.onerror = (e) => {
|
||||
console.error('[useVideoASR] WebSocket error:', e)
|
||||
setStatus('error')
|
||||
}
|
||||
ws.onclose = () => {
|
||||
isStreamingRef.current = false
|
||||
setIsStreaming(false)
|
||||
setStatus('disconnected')
|
||||
}
|
||||
}, [getWSURL])
|
||||
|
||||
// Closes the socket currently held in wsRef (if any) and clears the ref.
const closeWebSocket = useCallback(() => {
  const socket = wsRef.current
  if (socket) {
    socket.close()
  }
  wsRef.current = null
}, [])
|
||||
|
||||
// Starts a streaming session: marks the hook as connecting, resumes the
// AudioContext held in audioContextRef (if one exists), then replaces any
// existing WebSocket with a fresh connection. Does nothing without a video element.
const startStreaming = useCallback(() => {
  if (!videoElement) return
  try {
    setStatus('connecting')
    // resume() returns a promise, so a rejection would not reach the
    // surrounding try/catch. Handle it here so it is reported the same way
    // as a synchronous failure instead of becoming an unhandled rejection.
    audioContextRef.current?.resume().catch((err: unknown) => {
      console.error('[useVideoASR] AudioContext resume failed:', err)
      setStatus('error')
    })
    // Drop any previous socket before opening a new one.
    closeWebSocket()
    connectWebSocket()
  } catch (err) {
    console.error('[useVideoASR] startStreaming failed:', err)
    setStatus('error')
  }
}, [videoElement, closeWebSocket, connectWebSocket])
|
||||
|
||||
// Stops streaming: clears the streaming flags, closes the socket and returns
// to 'idle'. If the latest transcript (read from transcriptRef, not from
// captured state) is non-blank, it is handed to the current onFinalTranscript
// callback and the partial transcript is then cleared.
const stopStreaming = useCallback(() => {
  isStreamingRef.current = false
  setIsStreaming(false)
  closeWebSocket()
  setStatus('idle')

  const finalText = transcriptRef.current.trim()
  if (!finalText) return

  // Read the callback through the ref so the latest handler is used.
  const notifyFinal = onFinalTranscriptRef.current
  if (notifyFinal) {
    notifyFinal(finalText)
  }
  setPartialTranscript('')
}, [closeWebSocket])
|
||||
|
||||
useEffect(() => {
|
||||
if (!videoElement || graphSetupRef.current) return
|
||||
try {
|
||||
const audioContext = new AudioContext({ sampleRate: 16000 })
|
||||
audioContextRef.current = audioContext
|
||||
|
||||
const source = audioContext.createMediaElementSource(videoElement)
|
||||
sourceRef.current = source
|
||||
|
||||
const processor = audioContext.createScriptProcessor(4096, 1, 1)
|
||||
processorRef.current = processor
|
||||
|
||||
processor.onaudioprocess = (e) => {
|
||||
const float32Data = e.inputBuffer.getChannelData(0)
|
||||
const outputData = e.outputBuffer.getChannelData(0)
|
||||
outputData.set(float32Data)
|
||||
if (!isStreamingRef.current) return
|
||||
if (!wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) return
|
||||
const float32Data = e.inputBuffer.getChannelData(0)
|
||||
wsRef.current.send(float32Data.buffer)
|
||||
}
|
||||
|
||||
source.connect(processor)
|
||||
processor.connect(audioContext.destination)
|
||||
} catch {
|
||||
setStatus('error')
|
||||
graphSetupRef.current = true
|
||||
} catch (err) {
|
||||
console.error('[useVideoASR] audio graph setup failed:', err)
|
||||
}
|
||||
}, [videoElement, getWSURL, onFinalTranscript])
|
||||
|
||||
const stopStreaming = useCallback(() => {
|
||||
isStreamingRef.current = false
|
||||
setIsStreaming(false)
|
||||
processorRef.current?.disconnect()
|
||||
processorRef.current = null
|
||||
sourceRef.current?.disconnect()
|
||||
sourceRef.current = null
|
||||
wsRef.current?.close()
|
||||
wsRef.current = null
|
||||
audioContextRef.current?.close()
|
||||
audioContextRef.current = null
|
||||
setStatus('idle')
|
||||
setPartialTranscript('')
|
||||
}, [])
|
||||
}, [videoElement])
|
||||
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
|
|
|
|||
Loading…
Reference in New Issue