<!-- Source: legco_ai_assistant/.examples/alibaba_asr_frontend_vanill... (160 lines, 6.1 KiB, HTML) -->

<!--
Reference: Browser Audio Capture + WebSocket Streaming for Alibaba Cloud ASR.
Extracted from: /mnt/c/Users/woody/Documents/projects/voice input/backend/static/index.html
Architecture:
Browser mic → AudioContext (16kHz mono) → ScriptProcessor → Float32Array
→ WebSocket.send(float32Data.buffer) → FastAPI → DashScope
Key points:
- Must use HTTPS/WSS (browsers only expose getUserMedia in secure contexts; http://localhost is treated as secure for local development)
- AudioContext sampleRate MUST be 16000
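- ScriptProcessor buffer size: 4096 samples (~256 ms per chunk at 16 kHz; a smaller buffer sends more often for lower latency). Note: ScriptProcessorNode is deprecated; AudioWorklet is the modern replacement.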
- Send raw Float32 bytes (NOT base64, NOT WAV) to our backend WebSocket
- Backend handles Float32 → S16_LE → base64 conversion
- Language query param: ?language=yue (Cantonese), zh, en, auto
-->
<!DOCTYPE html>
<html lang="zh-HK">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>ASR Reference - Audio Capture</title>
</head>
<body>
<h1>Alibaba Cloud ASR - Audio Capture Pattern</h1>
<!-- Language selector (labelled so assistive technology announces its purpose) -->
<label for="langSelect">Recognition language</label>
<select id="langSelect">
  <option value="auto">Auto-Detect</option>
  <option value="en">English</option>
  <option value="zh">Mandarin</option>
  <option value="yue" selected>Cantonese</option>
</select>
<!-- Record toggle button; explicit type so it never acts as a form submit button -->
<button id="recordBtn" type="button">Start Recording</button>
<!-- Status indicator; role="status" makes script-driven text changes a polite live region -->
<div id="status" role="status">Ready</div>
<!-- Transcript display -->
<div id="transcript"></div>
<script>
// ─── Configuration ──────────────────────────────────────────────────────────
// Endpoint path on this host; the last segment is the session identifier
// (video_id from URL or session).
const WS_PATH = '/ws/asr/session-1';
// Same host as the page, using wss: when the page itself was served over https:.
const WS_BASE = (location.protocol === 'https:' ? 'wss:' : 'ws:') + '//' + location.host + WS_PATH;
// ─── State ──────────────────────────────────────────────────────────────────
// Handles for the active capture session; all null while idle.
let ws = null;            // WebSocket to the backend
let audioContext = null;  // AudioContext driving the capture graph
let processor = null;     // ScriptProcessorNode delivering raw PCM
let stream = null;        // MediaStream from the microphone
let isRecording = false;  // true while audio chunks should be sent
// ─── DOM Refs ───────────────────────────────────────────────────────────────
const recordBtn = document.querySelector('#recordBtn');
const langSelect = document.querySelector('#langSelect');
const statusEl = document.querySelector('#status');
const transcriptEl = document.querySelector('#transcript');
// ─── WebSocket URL Builder ──────────────────────────────────────────────────
/**
 * Build the WebSocket URL for the language currently chosen in the selector.
 *
 * @returns {string} WS_BASE unchanged for "auto", otherwise WS_BASE with a
 *   `language` query parameter carrying the selected value.
 */
function getWSURL() {
  const selected = langSelect.value;
  if (selected === 'auto') {
    return WS_BASE;
  }
  return `${WS_BASE}?language=${selected}`;
}
// ─── Status Helper ──────────────────────────────────────────────────────────
/**
 * Show a status message in the status element.
 *
 * @param {string} text Message to display; assigned as plain text (textContent),
 *   so it is never interpreted as HTML.
 */
function setStatus(text) {
statusEl.textContent = text;
}
// ─── Audio Capture Setup ────────────────────────────────────────────────────
/**
 * Start a capture session: open the microphone, build a 16 kHz audio graph and
 * stream raw Float32 PCM chunks to the backend over a WebSocket.
 *
 * Publishes the handles it creates to the module-level `stream`,
 * `audioContext`, `processor` and `ws` variables so stopRecording() can tear
 * the session down, and sets `isRecording` once the socket is open.
 *
 * The record button stays disabled until the socket has opened or failed, so a
 * second click cannot start an overlapping session. If any step fails, or the
 * socket closes on its own, everything created by this call is released.
 *
 * @returns {Promise<void>} Settles once setup has been attempted; errors are
 *   reported through the status line and console instead of being thrown.
 */
async function startRecording() {
  // navigator.mediaDevices is undefined outside secure contexts (plain HTTP).
  if (!navigator.mediaDevices?.getUserMedia) {
    setStatus('Microphone capture is unavailable (HTTPS required)');
    return;
  }

  // Handles owned by this call. Handlers use these rather than the module-level
  // variables so they keep acting on their own session's resources.
  let micStream = null;
  let context = null;
  let node = null;
  let socket = null;
  let opened = false;

  // Release the microphone and audio graph created by this call. Safe to call
  // more than once.
  const releaseAudio = () => {
    if (node) {
      node.onaudioprocess = null;
      node.disconnect();
    }
    micStream?.getTracks().forEach((track) => track.stop());
    if (context && context.state !== 'closed') {
      // Best-effort: close() rejects if the context was already closed elsewhere.
      context.close().catch(() => {});
    }
  };

  recordBtn.disabled = true;
  setStatus('Starting...');

  try {
    // Step 1: Get microphone access
    micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
    stream = micStream;

    // Step 2: Create AudioContext at EXACTLY 16kHz (required by DashScope)
    context = new AudioContext({ sampleRate: 16000 });
    audioContext = context;

    // Step 3: Create media stream source
    const source = context.createMediaStreamSource(micStream);

    // Step 4: Create ScriptProcessor for raw PCM access
    // Buffer size 4096 → ~256ms chunks at 16kHz (lower = more responsive)
    node = context.createScriptProcessor(4096, 1, 1);
    processor = node;

    // Step 5: Connect WebSocket
    socket = new WebSocket(getWSURL());
    ws = socket;

    socket.onopen = () => {
      if (ws !== socket) {
        // A different session has taken over; drop this connection.
        socket.close();
        return;
      }
      opened = true;
      isRecording = true;
      recordBtn.disabled = false;
      recordBtn.textContent = 'Stop Recording';
      setStatus('Listening...');
    };

    socket.onmessage = (e) => {
      let payload;
      try {
        payload = JSON.parse(e.data);
      } catch (parseErr) {
        // A malformed frame must not break the session; skip it.
        console.error('Ignoring non-JSON ASR message:', parseErr);
        return;
      }
      // Display transcript — in our React app, this goes into QueryInput
      transcriptEl.textContent = payload?.full_text || '';
    };

    socket.onerror = () => {
      if (ws === socket) setStatus('WebSocket error');
    };

    socket.onclose = () => {
      // Always free the microphone, including when the server hung up.
      releaseAudio();
      if (ws !== socket) return; // UI now belongs to a newer session
      // stopRecording() clears isRecording before closing, so a still-set flag
      // means the connection dropped without the user asking for it.
      const droppedMidSession = isRecording;
      isRecording = false;
      recordBtn.disabled = false;
      recordBtn.textContent = 'Start Recording';
      if (droppedMidSession) {
        setStatus('Disconnected');
      } else if (!opened) {
        setStatus('WebSocket error');
      }
    };

    // Step 6: Audio processing callback — fires every ~256ms
    node.onaudioprocess = (e) => {
      // Get raw Float32 samples from input channel 0
      const float32Data = e.inputBuffer.getChannelData(0);
      // Send raw Float32 bytes to backend (NOT base64, NOT WAV)
      // The .buffer property gives us an ArrayBuffer of the Float32Array
      if (isRecording && socket.readyState === WebSocket.OPEN) {
        socket.send(float32Data.buffer);
      }
    };

    // Step 7: Connect the audio graph
    source.connect(node);
    // The processor must be connected onward for onaudioprocess to keep firing
    // (observed Chrome behaviour). Its output buffer is never written here.
    node.connect(context.destination);
  } catch (err) {
    // Undo whatever was created before the failure.
    if (socket) {
      // Detach first so the close event cannot overwrite the message below.
      socket.onopen = socket.onmessage = socket.onerror = socket.onclose = null;
      socket.close();
    }
    releaseAudio();
    isRecording = false;
    recordBtn.disabled = false;
    recordBtn.textContent = 'Start Recording';

    const errName = err?.name;
    if (errName === 'NotAllowedError' || errName === 'SecurityError') {
      setStatus('Microphone access denied');
    } else if (errName === 'NotFoundError') {
      setStatus('No microphone found');
    } else {
      setStatus('Could not start recording');
    }
    console.error('Audio capture error:', err);
  }
}
// ─── Stop Recording ─────────────────────────────────────────────────────────
/**
 * Stop the active capture session and release everything it holds: the audio
 * graph, the microphone tracks, the AudioContext and the WebSocket.
 *
 * Safe to call when nothing is running; every handle is checked before use and
 * reset to null afterwards so a stale handle is never reused.
 */
function stopRecording() {
  // Stop sending first so no chunk is pushed during teardown.
  isRecording = false;

  // Clean up audio graph
  if (processor) {
    processor.onaudioprocess = null;
    processor.disconnect();
  }
  stream?.getTracks().forEach((track) => track.stop());
  // Close the context too: leaving it open leaks one AudioContext per session.
  if (audioContext && audioContext.state !== 'closed') {
    // Best-effort: close() rejects if the context is already closed.
    audioContext.close().catch(() => {});
  }

  // Close WebSocket
  if (ws) {
    // The UI is reset below; detach the close handler so it cannot replace
    // "Ready" with a disconnect message after this deliberate stop.
    ws.onclose = null;
    ws.close();
  }

  processor = null;
  stream = null;
  audioContext = null;
  ws = null;

  recordBtn.textContent = 'Start Recording';
  setStatus('Ready');
}
// ─── Toggle Recording ───────────────────────────────────────────────────────
// One button drives both actions: stop while recording, otherwise start.
recordBtn.addEventListener('click', () => {
  const toggle = isRecording ? stopRecording : startRecording;
  toggle();
});
</script>
</body>
</html>