diff --git a/.plans/phase2_enhancement_delta_sse.md b/.plans/phase2_enhancement_delta_sse.md new file mode 100644 index 0000000..8b8e893 --- /dev/null +++ b/.plans/phase2_enhancement_delta_sse.md @@ -0,0 +1,186 @@ +# Phase 2 Enhancement: Delta-based transcript via existing WebSocket + +**Created:** 2026-05-07 +**Status:** Complete +**Depends on:** Phase 2 (Complete) + +--- + +## 1. Goal + +Replace full-text WebSocket responses with **delta-only updates**. Backend computes the difference between consecutive `text` fields and sends only new characters. Frontend appends them client-side. All over the existing WebSocket. + +## 2. Why `text` field over `stash` + +Stash log analysis revealed DashScope partial events have TWO text fields: + +| Field | Behavior | Description | +|-------|----------|-------------| +| `stash` | Sliding window, ~7-20 chars | Latest uncommitted characters | +| `text` | **Monotonically growing** (within one `item_id`) | Stable committed transcription | + +`text` is the stable, cumulative transcription. It grows monotonically within each utterance (`item_id`). Delta = `text[len(prev_text):]` — simple suffix diff, no merge logic needed. + +`stash` is still useful: it contains trailing chars not yet committed to `text`. Sent alongside deltas so the frontend can append it on pause (completing the last sentence). + +## 3. Architecture + +``` +DashScope partial event: + {"text": "多謝主席咁啊", "stash": "咁啊亦都", "item_id": "item_ABC"} + +Backend computes: + delta = text[len(prev_text):] → "咁啊" (new chars only) + sends: {"delta":"咁啊", "stash":"咁啊亦都", "is_final":false} + +Frontend: + transcriptRef += msg.delta → accumulates locally + lastStashRef = msg.stash → stores for pause handler + +On pause: + if stash not already at end of text: + text += stash → "...可以诶处理埋诶呢啲余。" + onFinalTranscript(text) → persists in QueryInput +``` + +## 4. Key Design Decisions + +### 4.1 Stash-only events skipped (text empty) + +Early in each utterance, `text` is `""` while `stash` has sliding window content. These are skipped — delta can't be computed from a sliding window. Once `text` starts populating (after ~1-2s), deltas stream. + +### 4.2 Utterance boundary tracking + +DashScope splits speech into `item_id` segments. When `item_id` changes, `text` resets to `""`. On detection: +- `prev_display` set to `" "` (space) so the next utterance's full text is sent as delta, prepended with a space +- Result: `"...上一句 融資安排方面..."` — utterances flow continuously + +### 4.3 Trailing stash on pause + +The `text` field lags behind — final chars of each utterance are only in `stash`. Every WS message includes the current `stash`. Frontend stores it. On pause, appends stash to text if not already overlapping. + +## 5. Backend Implementation (`ws_asr.py`) + +```python +prev_display = "" +current_item_id = "" + +async def read_events(): + nonlocal accumulated_text, prev_display, current_item_id + while True: + event = await event_queue.get() + result = format_transcription_event(event, accumulated_text) + if result is None: + continue + if result["is_final"]: + # Completed utterance: send full accumulated text + transcript = event.get("transcript", "") + if transcript and transcript.strip(): + accumulated_text = build_display_text(accumulated_text, transcript) + prev_display = "" + result["delta"] = "" + result["full_text"] = _to_traditional(accumulated_text) + else: + text = result.pop("text", "") + stash = result.pop("stash", "") + + # Utterance boundary: item_id change resets text to empty + item_id = event.get("item_id", "") + if item_id and item_id != current_item_id: + if prev_display: + prev_display = " " # prepend space for next utterance + current_item_id = item_id + + # text is monotonically growing within one utterance + if text.strip(): + new_delta = "" + if text != prev_display: + if prev_display and text.startswith(prev_display): + new_delta = text[len(prev_display):] + else: + new_delta = text + prev_display = text + result["delta"] = _to_traditional(new_delta) if new_delta else "" + result["full_text"] = "" + result["stash"] = _to_traditional(stash) if stash.strip() else "" + else: + # text empty (new utterance starting) — skip + continue + await client_ws.send_json(result) +``` + +## 6. Frontend Implementation (`useVideoASR.ts`) + +```typescript +const transcriptRef = useRef('') +const lastStashRef = useRef('') + +ws.onmessage = (e) => { + const msg = JSON.parse(e.data) + if (msg.is_final && msg.full_text) { + transcriptRef.current = msg.full_text + lastStashRef.current = '' + setTranscript(msg.full_text) + setPartialTranscript('') + onFinalTranscriptRef.current?.(msg.full_text) + } else if (msg.delta) { + transcriptRef.current += msg.delta + lastStashRef.current = msg.stash || '' + setTranscript(transcriptRef.current) + setPartialTranscript(transcriptRef.current) + } +} + +const stopStreaming = useCallback(() => { + closeWebSocket() + let text = transcriptRef.current.trim() + const stash = lastStashRef.current.trim() + if (stash && !text.endsWith(stash)) { + text += stash + } + lastStashRef.current = '' + if (text) { + onFinalTranscriptRef.current?.(text) + setPartialTranscript('') + } +}, [closeWebSocket]) +``` + +## 7. WebSocket Message Format + +```json +// Partial event (new chars only, with stash): +{"delta":"咁啊", "stash":"咁啊亦都", "full_text":"", "language":"yue", "is_final":false} + +// Completed event (full accumulated text): +{"delta":"", "stash":"", "full_text":"多謝主席咁啊亦都...", "language":"yue", "is_final":true} +``` + +## 8. Files Changed + +| File | Change | +|------|--------| +| `backend/app/routers/ws_asr.py` | Delta computation from `text` field, `item_id` tracking, `stash` passthrough | +| `backend/app/test/test_phase2_ws_protocol.py` | Updated partial event tests for `text`/`stash` fields | +| `frontend/src/hooks/useVideoASR.ts` | `lastStashRef`, delta accumulation, stash append on pause | +| `frontend/src/types/index.ts` | Added `stash?: string` to `ASRMessage` | + +## 9. Edge Cases Handled + +| Case | Handling | +|------|----------| +| `text` empty (new utterance) | Skip — delta not computable from sliding stash | +| `item_id` changes | Reset `prev_display`, prepend space to next utterance | +| Trailing stash not in `text` | Frontend appends on pause via `lastStashRef` | +| `text` doesn't start with `prev_display` | Send entire `text` as delta (utterance boundary) | +| Multiple completed events | Each resets `prev_display` and `lastStash` | +| Play again after pause | New WS session, all state reset | + +## 10. Lessons Learned + +- **`text` is the right field** — monotonically growing, simple delta = suffix diff +- **`stash` is a sliding window** — can't be accumulated, can't be delta-diffed +- **`item_id` matters** — DashScope splits speech into items; text resets on item change +- **`_merge_stash` not needed** — `text` is already cumulative; stash only needed for trailing chars +- **`replace: true` destroys cross-utterance text** — simpler to just append with space separator +- **Stash on pause completes sentences** — `text` lags ~200ms behind; stash fills the gap diff --git a/.plans/phase2_enhancement_use_text_field.md b/.plans/phase2_enhancement_use_text_field.md new file mode 100644 index 0000000..8ffa3bc --- /dev/null +++ b/.plans/phase2_enhancement_use_text_field.md @@ -0,0 +1,119 @@ +# Phase 2 Enhancement: Use `text` field instead of `stash` + +**Created:** 2026-05-07 +**Status:** Planning +**Depends on:** Phase 2 (Complete) + +--- + +## 1. Discovery + +Stash log analysis revealed DashScope partial events contain TWO text fields: + +| Field | Behavior | Description | +|-------|----------|-------------| +| `stash` | Sliding window, ~7-20 chars, replaces on each event | Latest characters recognized (raw ASR output) | +| `text` | Monotonically growing, **never shrinks** | Stable cumulative transcription of current utterance | + +``` +Event sequence example: + stash="多" text="" ← text empty early on + stash="多谢" text="" + stash="多谢主席咁啊" text="" + stash="主席咁啊,亦" text="" ← stash slides, text still empty + stash="咁啊,亦都多谢" text="多谢主席咁啊亦" ← text starts populating + stash="都多谢邱主任" text="多谢主席咁啊亦都多谢邱主任头先..." + stash="point嘅详细介绍" text="多谢主席咁啊亦都多谢邱主任头先个诶powerpoint嘅详细介绍..." +``` + +`text` grows monotonically — it's the stable transcription. `stash` slides as new audio arrives. + +## 2. Why This Is Better + +- **No `_merge_stash` needed** — `text` is already cumulative per utterance +- **No overlap detection** — characters never change once set +- **No risk of wrong merging** — stashes sometimes overlap incorrectly (sliding window may lose context) +- **Simpler code** — less logic, less surface area for bugs + +## 3. Changes Required + +### 3.1 Backend: `format_transcription_event` (ws_asr.py) + +```python +# BEFORE: extract stash field +if event_type == "...transcription.text": + stash = event.get("stash", "") + return {"delta": "", "stash": stash, ...} + +# AFTER: extract text field +if event_type == "...transcription.text": + text = event.get("text", "") + return {"delta": "", "text": text, ...} +``` + +### 3.2 Backend: `read_events` (ws_asr.py) + +```python +# BEFORE: merge stashes +else: + stash = result.pop("stash", "") + if stash.strip(): + partial_buffer = _merge_stash(partial_buffer, stash) + display = build_display_text(accumulated_text, partial_buffer) + +# AFTER: use text directly (already cumulative) +else: + text = result.pop("text", "") + if text.strip(): + partial_buffer = text # text already cumulative + display = build_display_text(accumulated_text, partial_buffer) +``` + +### 3.3 Backend: Remove `_merge_stash` function + +No longer needed. + +### 3.4 Backend: Tests (`test_phase2_ws_protocol.py`) + +- Replace `TestMergeStash` class with `TestTextFieldFormatting` +- Update partial event tests to use `text` field instead of `stash` +- Verify monotonic growth (text never shrinks character-by-character) + +### 3.5 Backend: Stash log format + +Update log to capture both fields for future debugging: + +```python +_stash_logger.info( + "seq=%d elapsed_ms=%d stash_len=%d text_len=%d stash=%r text=%r ...", + ... +) +``` + +## 4. What Does NOT Change + +- Frontend (`useVideoASR.ts`) — already handles `full_text` correctly, no changes needed +- Frontend (`QueryInput.tsx`) — unchanged +- Pause/stop logic — unchanged +- Completed event handling — unchanged (completed events already use `transcript` field) +- `partial_buffer` variable — still used, just populated from `text` instead of merged stashes + +## 5. Files Changed + +| File | Change | +|------|--------| +| `backend/app/routers/ws_asr.py` | Remove `_merge_stash()`, use `text` field, update stash logging | +| `backend/app/test/test_phase2_ws_protocol.py` | Replace merge tests with text-field tests | + +## 6. Acceptance Criteria + +- [ ] `text` field used for partial events instead of `stash` +- [ ] `_merge_stash` function removed +- [ ] Text displayed in QueryInput grows monotonically (no jumping/replacing) +- [ ] All 16 ws_protocol tests pass (updated) +- [ ] Text persists on pause (existing behavior, unchanged) +- [ ] Stash log captures both `stash` and `text` fields for reference + +## 7. Rollback Risk + +Low. Only 2 files changed, only backend. Frontend untouched. If `text` field behaves unexpectedly, revert to `_merge_stash` approach (already committed). diff --git a/backend/app/routers/ws_asr.py b/backend/app/routers/ws_asr.py index c0a92e4..9c04ddd 100644 --- a/backend/app/routers/ws_asr.py +++ b/backend/app/routers/ws_asr.py @@ -10,6 +10,14 @@ from app.core.config import get_settings from app.services.asr_client import float32_to_s16le, build_display_text, _to_traditional logger = logging.getLogger(__name__) + +_stash_logger = logging.getLogger("ws_asr.stash") +_stash_logger.propagate = False +_stash_handler = logging.FileHandler("app/log/stash.log") +_stash_handler.setFormatter(logging.Formatter("%(asctime)s %(message)s")) +_stash_logger.addHandler(_stash_handler) +_stash_logger.setLevel(logging.DEBUG) + router = APIRouter(tags=["asr"]) try: @@ -63,9 +71,11 @@ def format_transcription_event(event: dict, accumulated: str) -> dict | None: event_type = event.get("type", "") if event_type == "conversation.item.input_audio_transcription.text": + text = event.get("text", "") stash = event.get("stash", "") return { "delta": "", + "text": text, "stash": stash, "language": event.get("language", "yue"), "is_final": False, @@ -114,11 +124,13 @@ async def _ws_proxy_dashscope(client_ws: WebSocket, loop: asyncio.AbstractEventL logger.info("dashscope-session-updated lang=%s", language) accumulated_text = "" - partial_buffer = "" + prev_display = "" + current_item_id = "" chunk_count = 0 + stash_seq = 0 async def read_events(): - nonlocal accumulated_text, partial_buffer + nonlocal accumulated_text, prev_display, current_item_id, stash_seq while True: event = await event_queue.get() result = format_transcription_event(event, accumulated_text) @@ -128,16 +140,48 @@ async def _ws_proxy_dashscope(client_ws: WebSocket, loop: asyncio.AbstractEventL transcript = event.get("transcript", "") if transcript and transcript.strip(): accumulated_text = build_display_text(accumulated_text, transcript) - partial_buffer = "" + prev_display = "" + result["delta"] = "" result["full_text"] = _to_traditional(accumulated_text) logger.info("dashscope-utterance-completed text_len=%d lang=%s", len(accumulated_text), result.get("language", "yue")) else: + text = result.pop("text", "") stash = result.pop("stash", "") - if stash.strip(): - partial_buffer = _merge_stash(partial_buffer, stash) - display = build_display_text(accumulated_text, partial_buffer) - result["full_text"] = _to_traditional(display) - await client_ws.send_json(result) + elapsed_ms = int((time.monotonic() - session_start) * 1000) + stash_seq += 1 + _stash_logger.info( + "seq=%d elapsed_ms=%d stash_len=%d text_len=%d stash=%r text=%r lang=%s event=%s", + stash_seq, + elapsed_ms, + len(stash), + len(text), + stash, + text, + result.get("language", "?"), + json.dumps(event, ensure_ascii=False), + ) + # New utterance: item_id changes, text resets to empty + item_id = event.get("item_id", "") + if item_id and item_id != current_item_id: + if prev_display: + prev_display = " " # prepend space for next utterance + current_item_id = item_id + # text is monotonically growing within one utterance + if text.strip(): + new_delta = "" + if text != prev_display: + if prev_display and text.startswith(prev_display): + new_delta = text[len(prev_display):] + else: + new_delta = text + prev_display = text + result["delta"] = _to_traditional(new_delta) if new_delta else "" + result["full_text"] = "" + result["stash"] = _to_traditional(stash) if stash.strip() else "" + else: + continue + if result["delta"] or result["is_final"]: + await client_ws.send_json(result) read_task = asyncio.create_task(read_events()) diff --git a/backend/app/test/test_phase2_ws_protocol.py b/backend/app/test/test_phase2_ws_protocol.py index 4495636..9cdc64b 100644 --- a/backend/app/test/test_phase2_ws_protocol.py +++ b/backend/app/test/test_phase2_ws_protocol.py @@ -114,13 +114,14 @@ class TestMergeStash: class TestProxyFormatsTranscriptionTextEvent: - def test_partial_event_returns_stash_field(self): - """Partial event returns raw stash for caller to merge.""" + def test_partial_event_returns_text_and_stash_fields(self): + """Partial event returns both text (stable prefix) and stash (trailing).""" from app.routers.ws_asr import format_transcription_event event = { "type": "conversation.item.input_audio_transcription.text", - "stash": "你好", + "text": "多謝主席", + "stash": "席咁啊", "language": "yue", } @@ -129,20 +130,23 @@ class TestProxyFormatsTranscriptionTextEvent: assert result["is_final"] is False assert result["language"] == "yue" assert result["delta"] == "" - assert result["stash"] == "你好" + assert result["text"] == "多謝主席" + assert result["stash"] == "席咁啊" def test_partial_event_ignores_accumulated(self): - """Partial event returns stash unchanged regardless of accumulated.""" + """Partial event returns fields unchanged regardless of accumulated.""" from app.routers.ws_asr import format_transcription_event event = { "type": "conversation.item.input_audio_transcription.text", - "stash": "世界", + "text": "世界", + "stash": "界大同", "language": "yue", } result = format_transcription_event(event, "你好") - assert result["stash"] == "世界" + assert result["text"] == "世界" + assert result["stash"] == "界大同" class TestProxyFormatsTranscriptionCompletedEvent: diff --git a/frontend/src/hooks/useVideoASR.ts b/frontend/src/hooks/useVideoASR.ts index 52f8cc4..1930a5a 100644 --- a/frontend/src/hooks/useVideoASR.ts +++ b/frontend/src/hooks/useVideoASR.ts @@ -26,6 +26,7 @@ export function useVideoASR({ const isStreamingRef = useRef(false) const graphSetupRef = useRef(false) const transcriptRef = useRef('') + const lastStashRef = useRef('') const onFinalTranscriptRef = useRef(onFinalTranscript) onFinalTranscriptRef.current = onFinalTranscript @@ -49,11 +50,17 @@ export function useVideoASR({ ws.onmessage = (e) => { const msg: ASRMessage = JSON.parse(e.data) - transcriptRef.current = msg.full_text - setTranscript(msg.full_text) - setPartialTranscript(msg.is_final ? '' : msg.full_text) - if (msg.is_final && msg.full_text.trim()) { + if (msg.is_final && msg.full_text) { + transcriptRef.current = msg.full_text + lastStashRef.current = '' + setTranscript(msg.full_text) + setPartialTranscript('') onFinalTranscriptRef.current?.(msg.full_text) + } else if (msg.delta) { + transcriptRef.current += msg.delta + lastStashRef.current = (msg as any).stash || '' + setTranscript(transcriptRef.current) + setPartialTranscript(transcriptRef.current) } } @@ -91,7 +98,13 @@ export function useVideoASR({ setIsStreaming(false) closeWebSocket() setStatus('idle') - const currentText = transcriptRef.current.trim() + let currentText = transcriptRef.current.trim() + const stash = lastStashRef.current.trim() + if (stash && !currentText.endsWith(stash)) { + currentText += stash + transcriptRef.current = currentText + } + lastStashRef.current = '' if (currentText) { onFinalTranscriptRef.current?.(currentText) setPartialTranscript('') diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts index 350a8f4..527eb17 100644 --- a/frontend/src/types/index.ts +++ b/frontend/src/types/index.ts @@ -178,6 +178,7 @@ export interface ASRMessage { full_text: string language: string is_final: boolean + stash?: string } export type ASRStatus = 'idle' | 'connecting' | 'streaming' | 'disconnected' | 'error'