diff --git a/.plans/phase2_enhancement_delta_sse.md b/.plans/phase2_enhancement_delta_sse.md
new file mode 100644
index 0000000..8b8e893
--- /dev/null
+++ b/.plans/phase2_enhancement_delta_sse.md
@@ -0,0 +1,186 @@
+# Phase 2 Enhancement: Delta-based transcript via existing WebSocket
+
+**Created:** 2026-05-07
+**Status:** Complete
+**Depends on:** Phase 2 (Complete)
+
+---
+
+## 1. Goal
+
+Replace full-text WebSocket responses with **delta-only updates**. Backend computes the difference between consecutive `text` fields and sends only new characters. Frontend appends them client-side. All over the existing WebSocket.
+
+## 2. Why `text` field over `stash`
+
+Stash log analysis revealed DashScope partial events have TWO text fields:
+
+| Field | Behavior | Description |
+|-------|----------|-------------|
+| `stash` | Sliding window, ~7-20 chars | Latest uncommitted characters |
+| `text` | **Monotonically growing** (within one `item_id`) | Stable committed transcription |
+
+`text` is the stable, cumulative transcription. It grows monotonically within each utterance (`item_id`). Delta = `text[len(prev_text):]` — simple suffix diff, no merge logic needed.
+
+`stash` is still useful: it contains trailing chars not yet committed to `text`. Sent alongside deltas so the frontend can append it on pause (completing the last sentence).
+
+## 3. Architecture
+
+```
+DashScope partial event:
+  {"text": "多謝主席咁啊", "stash": "咁啊亦都", "item_id": "item_ABC"}
+
+Backend computes:
+  delta = text[len(prev_text):]       → "咁啊" (new chars only)
+  sends: {"delta":"咁啊", "stash":"咁啊亦都", "is_final":false}
+
+Frontend:
+  transcriptRef += msg.delta          → accumulates locally
+  lastStashRef = msg.stash            → stores for pause handler
+
+On pause:
+  if stash not already at end of text:
+    text += stash                     → "...可以诶处理埋诶呢啲余。"
+  onFinalTranscript(text)             → persists in QueryInput
+```
+
+## 4. Key Design Decisions
+
+### 4.1 Stash-only events skipped (text empty)
+
+Early in each utterance, `text` is `""` while `stash` has sliding window content. These are skipped — delta can't be computed from a sliding window. Once `text` starts populating (after ~1-2s), deltas stream.
+
+### 4.2 Utterance boundary tracking
+
+DashScope splits speech into `item_id` segments. When `item_id` changes, `text` resets to `""`. On detection:
+- `prev_display` set to `" "` (space) so the next utterance's full text is sent as delta, prepended with a space
+- Result: `"...上一句 融資安排方面..."` — utterances flow continuously
+
+### 4.3 Trailing stash on pause
+
+The `text` field lags behind — final chars of each utterance are only in `stash`. Every WS message includes the current `stash`. Frontend stores it. On pause, appends stash to text if not already overlapping.
+
+## 5. Backend Implementation (`ws_asr.py`)
+
+```python
+prev_display = ""
+current_item_id = ""
+
+async def read_events():
+    nonlocal accumulated_text, prev_display, current_item_id
+    while True:
+        event = await event_queue.get()
+        result = format_transcription_event(event, accumulated_text)
+        if result is None:
+            continue
+        if result["is_final"]:
+            # Completed utterance: send full accumulated text
+            transcript = event.get("transcript", "")
+            if transcript and transcript.strip():
+                accumulated_text = build_display_text(accumulated_text, transcript)
+            prev_display = ""
+            result["delta"] = ""
+            result["full_text"] = _to_traditional(accumulated_text)
+        else:
+            text = result.pop("text", "")
+            stash = result.pop("stash", "")
+
+            # Utterance boundary: item_id change resets text to empty
+            item_id = event.get("item_id", "")
+            if item_id and item_id != current_item_id:
+                if prev_display:
+                    prev_display = " "  # prepend space for next utterance
+                current_item_id = item_id
+
+            # text is monotonically growing within one utterance
+            if text.strip():
+                new_delta = ""
+                if text != prev_display:
+                    if prev_display and text.startswith(prev_display):
+                        new_delta = text[len(prev_display):]
+                    else:
+                        new_delta = text
+                    prev_display = text
+                result["delta"] = _to_traditional(new_delta) if new_delta else ""
+                result["full_text"] = ""
+                result["stash"] = _to_traditional(stash) if stash.strip() else ""
+            else:
+                # text empty (new utterance starting) — skip
+                continue
+        await client_ws.send_json(result)
+```
+
+## 6. Frontend Implementation (`useVideoASR.ts`)
+
+```typescript
+const transcriptRef = useRef('')
+const lastStashRef = useRef('')
+
+ws.onmessage = (e) => {
+    const msg = JSON.parse(e.data)
+    if (msg.is_final && msg.full_text) {
+        transcriptRef.current = msg.full_text
+        lastStashRef.current = ''
+        setTranscript(msg.full_text)
+        setPartialTranscript('')
+        onFinalTranscriptRef.current?.(msg.full_text)
+    } else if (msg.delta) {
+        transcriptRef.current += msg.delta
+        lastStashRef.current = msg.stash || ''
+        setTranscript(transcriptRef.current)
+        setPartialTranscript(transcriptRef.current)
+    }
+}
+
+const stopStreaming = useCallback(() => {
+    closeWebSocket()
+    let text = transcriptRef.current.trim()
+    const stash = lastStashRef.current.trim()
+    if (stash && !text.endsWith(stash)) {
+        text += stash
+    }
+    lastStashRef.current = ''
+    if (text) {
+        onFinalTranscriptRef.current?.(text)
+        setPartialTranscript('')
+    }
+}, [closeWebSocket])
+```
+
+## 7. WebSocket Message Format
+
+```json
+// Partial event (new chars only, with stash):
+{"delta":"咁啊", "stash":"咁啊亦都", "full_text":"", "language":"yue", "is_final":false}
+
+// Completed event (full accumulated text):
+{"delta":"", "stash":"", "full_text":"多謝主席咁啊亦都...", "language":"yue", "is_final":true}
+```
+
+## 8. Files Changed
+
+| File | Change |
+|------|--------|
+| `backend/app/routers/ws_asr.py` | Delta computation from `text` field, `item_id` tracking, `stash` passthrough |
+| `backend/app/test/test_phase2_ws_protocol.py` | Updated partial event tests for `text`/`stash` fields |
+| `frontend/src/hooks/useVideoASR.ts` | `lastStashRef`, delta accumulation, stash append on pause |
+| `frontend/src/types/index.ts` | Added `stash?: string` to `ASRMessage` |
+
+## 9. Edge Cases Handled
+
+| Case | Handling |
+|------|----------|
+| `text` empty (new utterance) | Skip — delta not computable from sliding stash |
+| `item_id` changes | Reset `prev_display`, prepend space to next utterance |
+| Trailing stash not in `text` | Frontend appends on pause via `lastStashRef` |
+| `text` doesn't start with `prev_display` | Send entire `text` as delta (utterance boundary) |
+| Multiple completed events | Each resets `prev_display` and `lastStash` |
+| Play again after pause | New WS session, all state reset |
+
+## 10. Lessons Learned
+
+- **`text` is the right field** — monotonically growing, simple delta = suffix diff
+- **`stash` is a sliding window** — can't be accumulated, can't be delta-diffed
+- **`item_id` matters** — DashScope splits speech into items; text resets on item change
+- **`_merge_stash` not needed** — `text` is already cumulative; stash only needed for trailing chars
+- **`replace: true` destroys cross-utterance text** — simpler to just append with space separator
+- **Stash on pause completes sentences** — `text` lags ~200ms behind; stash fills the gap
diff --git a/.plans/phase2_enhancement_use_text_field.md b/.plans/phase2_enhancement_use_text_field.md
new file mode 100644
index 0000000..8ffa3bc
--- /dev/null
+++ b/.plans/phase2_enhancement_use_text_field.md
@@ -0,0 +1,119 @@
+# Phase 2 Enhancement: Use `text` field instead of `stash`
+
+**Created:** 2026-05-07
+**Status:** Planning
+**Depends on:** Phase 2 (Complete)
+
+---
+
+## 1. Discovery
+
+Stash log analysis revealed DashScope partial events contain TWO text fields:
+
+| Field | Behavior | Description |
+|-------|----------|-------------|
+| `stash` | Sliding window, ~7-20 chars, replaces on each event | Latest characters recognized (raw ASR output) |
+| `text` | Monotonically growing, **never shrinks** | Stable cumulative transcription of current utterance |
+
+```
+Event sequence example:
+  stash="多"               text=""             ← text empty early on
+  stash="多谢"              text=""
+  stash="多谢主席咁啊"        text=""
+  stash="主席咁啊，亦"        text=""             ← stash slides, text still empty
+  stash="咁啊，亦都多谢"      text="多谢主席咁啊亦"  ← text starts populating
+  stash="都多谢邱主任"        text="多谢主席咁啊亦都多谢邱主任头先..."
+  stash="point嘅详细介绍"    text="多谢主席咁啊亦都多谢邱主任头先个诶powerpoint嘅详细介绍..."
+```
+
+`text` grows monotonically — it's the stable transcription. `stash` slides as new audio arrives.
+
+## 2. Why This Is Better
+
+- **No `_merge_stash` needed** — `text` is already cumulative per utterance
+- **No overlap detection** — characters never change once set
+- **No risk of wrong merging** — stashes sometimes overlap incorrectly (sliding window may lose context)
+- **Simpler code** — less logic, less surface area for bugs
+
+## 3. Changes Required
+
+### 3.1 Backend: `format_transcription_event` (ws_asr.py)
+
+```python
+# BEFORE: extract stash field
+if event_type == "...transcription.text":
+    stash = event.get("stash", "")
+    return {"delta": "", "stash": stash, ...}
+
+# AFTER: extract text field
+if event_type == "...transcription.text":
+    text = event.get("text", "")
+    return {"delta": "", "text": text, ...}
+```
+
+### 3.2 Backend: `read_events` (ws_asr.py)
+
+```python
+# BEFORE: merge stashes
+else:
+    stash = result.pop("stash", "")
+    if stash.strip():
+        partial_buffer = _merge_stash(partial_buffer, stash)
+    display = build_display_text(accumulated_text, partial_buffer)
+
+# AFTER: use text directly (already cumulative)
+else:
+    text = result.pop("text", "")
+    if text.strip():
+        partial_buffer = text       # text already cumulative
+    display = build_display_text(accumulated_text, partial_buffer)
+```
+
+### 3.3 Backend: Remove `_merge_stash` function
+
+No longer needed.
+
+### 3.4 Backend: Tests (`test_phase2_ws_protocol.py`)
+
+- Replace `TestMergeStash` class with `TestTextFieldFormatting`
+- Update partial event tests to use `text` field instead of `stash`
+- Verify monotonic growth (text never shrinks character-by-character)
+
+### 3.5 Backend: Stash log format
+
+Update log to capture both fields for future debugging:
+
+```python
+_stash_logger.info(
+    "seq=%d elapsed_ms=%d stash_len=%d text_len=%d stash=%r text=%r ...",
+    ...
+)
+```
+
+## 4. What Does NOT Change
+
+- Frontend (`useVideoASR.ts`) — already handles `full_text` correctly, no changes needed
+- Frontend (`QueryInput.tsx`) — unchanged
+- Pause/stop logic — unchanged
+- Completed event handling — unchanged (completed events already use `transcript` field)
+- `partial_buffer` variable — still used, just populated from `text` instead of merged stashes
+
+## 5. Files Changed
+
+| File | Change |
+|------|--------|
+| `backend/app/routers/ws_asr.py` | Remove `_merge_stash()`, use `text` field, update stash logging |
+| `backend/app/test/test_phase2_ws_protocol.py` | Replace merge tests with text-field tests |
+
+## 6. Acceptance Criteria
+
+- [ ] `text` field used for partial events instead of `stash`
+- [ ] `_merge_stash` function removed
+- [ ] Text displayed in QueryInput grows monotonically (no jumping/replacing)
+- [ ] All 16 ws_protocol tests pass (updated)
+- [ ] Text persists on pause (existing behavior, unchanged)
+- [ ] Stash log captures both `stash` and `text` fields for reference
+
+## 7. Rollback Risk
+
+Low. Only 2 files changed, only backend. Frontend untouched. If `text` field behaves unexpectedly, revert to `_merge_stash` approach (already committed).
diff --git a/backend/app/routers/ws_asr.py b/backend/app/routers/ws_asr.py
index c0a92e4..9c04ddd 100644
--- a/backend/app/routers/ws_asr.py
+++ b/backend/app/routers/ws_asr.py
@@ -10,6 +10,14 @@ from app.core.config import get_settings
 from app.services.asr_client import float32_to_s16le, build_display_text, _to_traditional
 
 logger = logging.getLogger(__name__)
+
+_stash_logger = logging.getLogger("ws_asr.stash")
+_stash_logger.propagate = False
+_stash_handler = logging.FileHandler("app/log/stash.log")
+_stash_handler.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
+_stash_logger.addHandler(_stash_handler)
+_stash_logger.setLevel(logging.DEBUG)
+
 router = APIRouter(tags=["asr"])
 
 try:
@@ -63,9 +71,11 @@ def format_transcription_event(event: dict, accumulated: str) -> dict | None:
     event_type = event.get("type", "")
 
     if event_type == "conversation.item.input_audio_transcription.text":
+        text = event.get("text", "")
         stash = event.get("stash", "")
         return {
             "delta": "",
+            "text": text,
             "stash": stash,
             "language": event.get("language", "yue"),
             "is_final": False,
@@ -114,11 +124,13 @@ async def _ws_proxy_dashscope(client_ws: WebSocket, loop: asyncio.AbstractEventL
     logger.info("dashscope-session-updated lang=%s", language)
 
     accumulated_text = ""
-    partial_buffer = ""
+    prev_display = ""
+    current_item_id = ""
     chunk_count = 0
+    stash_seq = 0
 
     async def read_events():
-        nonlocal accumulated_text, partial_buffer
+        nonlocal accumulated_text, prev_display, current_item_id, stash_seq
         while True:
             event = await event_queue.get()
             result = format_transcription_event(event, accumulated_text)
@@ -128,16 +140,48 @@ async def _ws_proxy_dashscope(client_ws: WebSocket, loop: asyncio.AbstractEventL
                 transcript = event.get("transcript", "")
                 if transcript and transcript.strip():
                     accumulated_text = build_display_text(accumulated_text, transcript)
-                partial_buffer = ""
+                prev_display = ""
+                result["delta"] = ""
                 result["full_text"] = _to_traditional(accumulated_text)
                 logger.info("dashscope-utterance-completed text_len=%d lang=%s", len(accumulated_text), result.get("language", "yue"))
             else:
+                text = result.pop("text", "")
                 stash = result.pop("stash", "")
-                if stash.strip():
-                    partial_buffer = _merge_stash(partial_buffer, stash)
-                display = build_display_text(accumulated_text, partial_buffer)
-                result["full_text"] = _to_traditional(display)
-            await client_ws.send_json(result)
+                elapsed_ms = int((time.monotonic() - session_start) * 1000)
+                stash_seq += 1
+                _stash_logger.info(
+                    "seq=%d elapsed_ms=%d stash_len=%d text_len=%d stash=%r text=%r lang=%s event=%s",
+                    stash_seq,
+                    elapsed_ms,
+                    len(stash),
+                    len(text),
+                    stash,
+                    text,
+                    result.get("language", "?"),
+                    json.dumps(event, ensure_ascii=False),
+                )
+                # New utterance: item_id changes, text resets to empty
+                item_id = event.get("item_id", "")
+                if item_id and item_id != current_item_id:
+                    if prev_display:
+                        prev_display = " "  # prepend space for next utterance
+                    current_item_id = item_id
+                # text is monotonically growing within one utterance
+                if text.strip():
+                    new_delta = ""
+                    if text != prev_display:
+                        if prev_display and text.startswith(prev_display):
+                            new_delta = text[len(prev_display):]
+                        else:
+                            new_delta = text
+                        prev_display = text
+                    result["delta"] = _to_traditional(new_delta) if new_delta else ""
+                    result["full_text"] = ""
+                    result["stash"] = _to_traditional(stash) if stash.strip() else ""
+                else:
+                    continue
+            if result["delta"] or result["is_final"]:
+                await client_ws.send_json(result)
 
     read_task = asyncio.create_task(read_events())
 
diff --git a/backend/app/test/test_phase2_ws_protocol.py b/backend/app/test/test_phase2_ws_protocol.py
index 4495636..9cdc64b 100644
--- a/backend/app/test/test_phase2_ws_protocol.py
+++ b/backend/app/test/test_phase2_ws_protocol.py
@@ -114,13 +114,14 @@ class TestMergeStash:
 
 
 class TestProxyFormatsTranscriptionTextEvent:
-    def test_partial_event_returns_stash_field(self):
-        """Partial event returns raw stash for caller to merge."""
+    def test_partial_event_returns_text_and_stash_fields(self):
+        """Partial event returns both text (stable prefix) and stash (trailing)."""
         from app.routers.ws_asr import format_transcription_event
 
         event = {
             "type": "conversation.item.input_audio_transcription.text",
-            "stash": "你好",
+            "text": "多謝主席",
+            "stash": "席咁啊",
             "language": "yue",
         }
 
@@ -129,20 +130,23 @@ class TestProxyFormatsTranscriptionTextEvent:
         assert result["is_final"] is False
         assert result["language"] == "yue"
         assert result["delta"] == ""
-        assert result["stash"] == "你好"
+        assert result["text"] == "多謝主席"
+        assert result["stash"] == "席咁啊"
 
     def test_partial_event_ignores_accumulated(self):
-        """Partial event returns stash unchanged regardless of accumulated."""
+        """Partial event returns fields unchanged regardless of accumulated."""
         from app.routers.ws_asr import format_transcription_event
 
         event = {
             "type": "conversation.item.input_audio_transcription.text",
-            "stash": "世界",
+            "text": "世界",
+            "stash": "界大同",
             "language": "yue",
         }
 
         result = format_transcription_event(event, "你好")
-        assert result["stash"] == "世界"
+        assert result["text"] == "世界"
+        assert result["stash"] == "界大同"
 
 
 class TestProxyFormatsTranscriptionCompletedEvent:
diff --git a/frontend/src/hooks/useVideoASR.ts b/frontend/src/hooks/useVideoASR.ts
index 52f8cc4..1930a5a 100644
--- a/frontend/src/hooks/useVideoASR.ts
+++ b/frontend/src/hooks/useVideoASR.ts
@@ -26,6 +26,7 @@ export function useVideoASR({
   const isStreamingRef = useRef(false)
   const graphSetupRef = useRef(false)
   const transcriptRef = useRef('')
+  const lastStashRef = useRef('')
   const onFinalTranscriptRef = useRef(onFinalTranscript)
   onFinalTranscriptRef.current = onFinalTranscript
 
@@ -49,11 +50,17 @@ export function useVideoASR({
 
     ws.onmessage = (e) => {
       const msg: ASRMessage = JSON.parse(e.data)
-      transcriptRef.current = msg.full_text
-      setTranscript(msg.full_text)
-      setPartialTranscript(msg.is_final ? '' : msg.full_text)
-      if (msg.is_final && msg.full_text.trim()) {
+      if (msg.is_final && msg.full_text) {
+        transcriptRef.current = msg.full_text
+        lastStashRef.current = ''
+        setTranscript(msg.full_text)
+        setPartialTranscript('')
         onFinalTranscriptRef.current?.(msg.full_text)
+      } else if (msg.delta) {
+        transcriptRef.current += msg.delta
+        lastStashRef.current = (msg as any).stash || ''
+        setTranscript(transcriptRef.current)
+        setPartialTranscript(transcriptRef.current)
       }
     }
 
@@ -91,7 +98,13 @@ export function useVideoASR({
     setIsStreaming(false)
     closeWebSocket()
     setStatus('idle')
-    const currentText = transcriptRef.current.trim()
+    let currentText = transcriptRef.current.trim()
+    const stash = lastStashRef.current.trim()
+    if (stash && !currentText.endsWith(stash)) {
+      currentText += stash
+      transcriptRef.current = currentText
+    }
+    lastStashRef.current = ''
     if (currentText) {
       onFinalTranscriptRef.current?.(currentText)
       setPartialTranscript('')
diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts
index 350a8f4..527eb17 100644
--- a/frontend/src/types/index.ts
+++ b/frontend/src/types/index.ts
@@ -178,6 +178,7 @@ export interface ASRMessage {
   full_text: string
   language: string
   is_final: boolean
+  stash?: string
 }
 
 export type ASRStatus = 'idle' | 'connecting' | 'streaming' | 'disconnected' | 'error'