legco_ai_assistant/.examples/alibaba_asr_frontend_vanill...

<!--
Reference: Browser Audio Capture + WebSocket Streaming for Alibaba Cloud ASR.

Extracted from: /mnt/c/Users/woody/Documents/projects/voice input/backend/static/index.html

Architecture:
  Browser mic → AudioContext (16kHz mono) → ScriptProcessor → Float32Array
  → WebSocket.send(float32Data.buffer) → FastAPI → DashScope

Key points:
  - Must be served over HTTPS/WSS: getUserMedia is only available in secure contexts (http://localhost is treated as secure for development)
  - AudioContext sampleRate MUST be 16000
  - ScriptProcessor buffer size: 4096 frames (~256 ms per chunk at 16 kHz; a smaller buffer = more frequent sends, lower latency)
  - Send raw Float32 bytes (NOT base64, NOT WAV) to our backend WebSocket
  - Backend handles Float32 → S16_LE → base64 conversion
  - Language query param: ?language=yue (Cantonese), zh, en, auto
-->
<!DOCTYPE html>
<html lang="zh-HK">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>ASR Reference - Audio Capture</title>
</head>
<body>
  <h1>Alibaba Cloud ASR - Audio Capture Pattern</h1>

  <!-- Language selector -->
  <select id="langSelect">
    <option value="auto">Auto-Detect</option>
    <option value="en">English</option>
    <option value="zh">Mandarin</option>
    <option value="yue" selected>Cantonese</option>
  </select>

  <!-- Record toggle button -->
  <button id="recordBtn">Start Recording</button>

  <!-- Status indicator -->
  <div id="status">Ready</div>

  <!-- Transcript display -->
  <div id="transcript"></div>

<script>
// ─── Configuration ──────────────────────────────────────────────────────────
// Path of the ASR WebSocket endpoint; the last segment is the session / video id.
const WS_PATH = '/ws/asr/session-1';
// Absolute WebSocket URL on the page's own host: wss: when the page is served
// over https:, ws: for any other page protocol.
const WS_BASE = [
  location.protocol === 'https:' ? 'wss:' : 'ws:',
  '//',
  location.host,
  WS_PATH,
].join('');

// ─── State ──────────────────────────────────────────────────────────────────
// Shared mutable state for the single recording session this page manages.
// startRecording() assigns these; stopRecording() reads them for teardown.
let ws = null;              // WebSocket to the backend ASR endpoint (null before first start)
let audioContext = null;    // AudioContext created at 16 kHz in startRecording()
let processor = null;       // ScriptProcessorNode whose onaudioprocess delivers raw PCM
let stream = null;          // MediaStream returned by getUserMedia (the microphone)
let isRecording = false;    // true only between WebSocket open and stop/close; gates ws.send

// ─── DOM Refs ───────────────────────────────────────────────────────────────
// Look up each UI element once by its id; the handlers below reuse these refs.
const recordBtn = document.querySelector('#recordBtn');       // start/stop toggle
const langSelect = document.querySelector('#langSelect');     // recognition language
const statusEl = document.querySelector('#status');           // status line
const transcriptEl = document.querySelector('#transcript');   // transcript output

// ─── WebSocket URL Builder ──────────────────────────────────────────────────
/**
 * Build the WebSocket URL for the language currently chosen in the selector.
 *
 * 'auto' means "no language hint", so the bare endpoint is returned; any other
 * value is sent as the `language` query parameter. The value is URL-encoded so
 * that an option containing reserved characters (&, =, #, spaces, ...) cannot
 * corrupt the query string.
 *
 * @returns {string} WebSocket URL to connect to.
 */
function getWSURL() {
  const lang = langSelect.value;
  if (lang === 'auto') {
    return WS_BASE;
  }
  return `${WS_BASE}?language=${encodeURIComponent(lang)}`;
}

// ─── Status Helper ──────────────────────────────────────────────────────────
/**
 * Show a message in the status indicator.
 * @param {string} text - Message to display (assigned as plain text, not HTML).
 */
function setStatus(text) {
  const indicator = statusEl;
  indicator.textContent = text;
}

// ─── Audio Capture Setup ────────────────────────────────────────────────────
/**
 * Start a recording session: open the microphone, build the 16 kHz audio
 * graph, connect the backend WebSocket and stream raw Float32 PCM chunks to it.
 *
 * Never rejects: any failure is reported through the status line and
 * console.error, and everything acquired so far (mic tracks, audio nodes,
 * AudioContext, socket) is released so the microphone does not stay live.
 *
 * The record button is disabled until the socket opens or startup fails,
 * because `isRecording` only becomes true in `onopen`; without this a second
 * click during connection would start a second session.
 *
 * @returns {Promise<void>}
 */
async function startRecording() {
  recordBtn.disabled = true;

  // Detach (and close) any previous socket so a late close event from it is
  // recognised as stale and cannot touch this session's UI state.
  const previousSocket = ws;
  ws = null;
  previousSocket?.close();

  // Session-local handles: cleanup must target exactly what THIS call created,
  // even if the shared globals have since been reassigned.
  let micStream = null;
  let context = null;
  let source = null;
  let node = null;
  let socket = null;

  // Idempotent teardown of this session's audio resources.
  const releaseAudio = () => {
    if (node) {
      node.onaudioprocess = null;
      node.disconnect();
    }
    source?.disconnect();
    micStream?.getTracks().forEach((track) => track.stop());
    if (context && context.state !== 'closed') {
      // close() rejects only if the context is already closed; nothing to recover.
      context.close().catch((closeErr) => {
        console.warn('AudioContext close failed:', closeErr);
      });
    }
    // Clear the shared references only if they still point at this session.
    if (processor === node) processor = null;
    if (stream === micStream) stream = null;
    if (audioContext === context) audioContext = null;
  };

  try {
    // Step 1: Get microphone access
    micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
    stream = micStream;

    // Step 2: Create AudioContext at EXACTLY 16kHz (required by DashScope)
    context = new AudioContext({ sampleRate: 16000 });
    audioContext = context;

    // Step 3: Create media stream source
    source = context.createMediaStreamSource(micStream);

    // Step 4: Create ScriptProcessor for raw PCM access
    // Buffer size 4096 → ~256ms chunks at 16kHz (lower = more responsive)
    node = context.createScriptProcessor(4096, 1, 1);
    processor = node;

    // Step 5: Connect WebSocket
    socket = new WebSocket(getWSURL());
    ws = socket;
    const session = socket;
    let opened = false;

    session.onopen = () => {
      if (ws !== session) return; // superseded by a newer session
      opened = true;
      isRecording = true;
      recordBtn.disabled = false;
      recordBtn.textContent = 'Stop Recording';
      setStatus('Listening...');
    };

    session.onmessage = (e) => {
      let payload;
      try {
        payload = JSON.parse(e.data);
      } catch (parseErr) {
        // A malformed frame must not break the message handler.
        console.warn('Ignoring non-JSON ASR message:', parseErr);
        return;
      }
      // Display transcript — in our React app, this goes into QueryInput
      transcriptEl.textContent = payload?.full_text || '';
    };

    session.onerror = () => {
      if (ws === session) setStatus('WebSocket error');
    };

    session.onclose = () => {
      // Always free the mic/audio graph, including when the SERVER closed the
      // connection (otherwise the microphone would stay live).
      releaseAudio();
      if (ws !== session) return; // stale socket: UI belongs to someone else

      const droppedWhileRecording = isRecording;
      ws = null;
      isRecording = false;
      recordBtn.disabled = false;
      recordBtn.textContent = 'Start Recording';
      // After a user-initiated stop, isRecording is already false and the
      // status has been set by stopRecording(); do not overwrite it.
      if (droppedWhileRecording || !opened) {
        setStatus('Disconnected');
      }
    };

    // Step 6: Audio processing callback — fires every ~256ms
    node.onaudioprocess = (e) => {
      // Get raw Float32 samples from input channel 0
      const float32Data = e.inputBuffer.getChannelData(0);

      // Send raw Float32 bytes to backend (NOT base64, NOT WAV)
      // The .buffer property gives us an ArrayBuffer of the Float32Array
      if (session.readyState === WebSocket.OPEN && isRecording) {
        session.send(float32Data.buffer);
      }
    };

    // Step 7: Connect the audio graph
    source.connect(node);
    // The processor must be connected to a destination for onaudioprocess to
    // fire reliably (notably in Chrome); its output is never written, so
    // nothing audible is played.
    node.connect(context.destination);
  } catch (err) {
    releaseAudio();
    if (socket) {
      if (ws === socket) ws = null;
      socket.close();
    }
    isRecording = false;
    recordBtn.disabled = false;
    recordBtn.textContent = 'Start Recording';

    // Only permission/security failures are "access denied"; a missing
    // device, an unsupported sample rate, or a bad socket URL are not.
    const denied = err?.name === 'NotAllowedError' || err?.name === 'SecurityError';
    setStatus(
      denied
        ? 'Microphone access denied'
        : `Audio capture failed: ${err?.message ?? err}`,
    );
    console.error('Audio capture error:', err);
  }
}

// ─── Stop Recording ─────────────────────────────────────────────────────────
/**
 * Stop the current recording session and release every resource it holds:
 * the ScriptProcessor node, the microphone tracks, the AudioContext and the
 * WebSocket. Safe to call when nothing is running and safe to call twice.
 *
 * The AudioContext is closed explicitly: each startRecording() creates a new
 * one, and browsers cap the number of live contexts, so leaving them open
 * leaks a context per session.
 */
function stopRecording() {
  // Flip first so any audio callback still in flight stops sending.
  isRecording = false;

  // Clean up audio graph
  if (processor) {
    processor.onaudioprocess = null;
    processor.disconnect();
    processor = null;
  }
  stream?.getTracks().forEach((track) => track.stop());
  stream = null;

  if (audioContext && audioContext.state !== 'closed') {
    // close() rejects only if the context is already closed; nothing to recover.
    audioContext.close().catch((closeErr) => {
      console.warn('AudioContext close failed:', closeErr);
    });
  }
  audioContext = null;

  // Close WebSocket
  ws?.close();
  ws = null;

  recordBtn.textContent = 'Start Recording';
  setStatus('Ready');
}

// ─── Toggle Recording ───────────────────────────────────────────────────────
// One button drives both directions: pick the action from the current
// recording state at click time, then run it.
recordBtn.addEventListener('click', () => {
  const toggle = isRecording ? stopRecording : startRecording;
  toggle();
});
</script>
</body>
</html>