160 lines
6.1 KiB
HTML
160 lines
6.1 KiB
HTML
<!--
|
|
Reference: Browser Audio Capture + WebSocket Streaming for Alibaba Cloud ASR.
|
|
|
|
Extracted from: /mnt/c/Users/woody/Documents/projects/voice input/backend/static/index.html
|
|
|
|
Architecture:
|
|
Browser mic → AudioContext (16kHz mono) → ScriptProcessor → Float32Array
|
|
→ WebSocket.send(float32Data.buffer) → FastAPI → DashScope
|
|
|
|
Key points:
|
|
- Must be served over HTTPS/WSS: browsers only expose getUserMedia in secure contexts (plain HTTP works on localhost only)
|
|
- AudioContext sampleRate MUST be 16000
|
|
- ScriptProcessor buffer size: 4096 (lower = more frequent sends, lower latency)
|
|
- Send raw Float32 bytes (NOT base64, NOT WAV) to our backend WebSocket
|
|
- Backend handles Float32 → S16_LE → base64 conversion
|
|
- Language query param: ?language=yue (Cantonese), zh, en, auto
|
|
-->
|
|
<!DOCTYPE html>
|
|
<html lang="zh-HK">
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
<title>ASR Reference - Audio Capture</title>
|
|
</head>
|
|
<body>
|
|
<h1>Alibaba Cloud ASR - Audio Capture Pattern</h1>
|
|
|
|
<!-- Language selector -->
|
|
<select id="langSelect">
|
|
<option value="auto">Auto-Detect</option>
|
|
<option value="en">English</option>
|
|
<option value="zh">Mandarin</option>
|
|
<option value="yue" selected>Cantonese</option>
|
|
</select>
|
|
|
|
<!-- Record toggle button -->
|
|
<button id="recordBtn">Start Recording</button>
|
|
|
|
<!-- Status indicator -->
|
|
<div id="status">Ready</div>
|
|
|
|
<!-- Transcript display -->
|
|
<div id="transcript"></div>
|
|
|
|
<script>
// ─── Configuration ──────────────────────────────────────────────────────────
// Path of the ASR WebSocket endpoint. The last segment is hard-coded in this
// reference page; per the original note it should be the video_id taken from
// the URL or the session.
const WS_PATH = '/ws/asr/session-1';

// Use the secure WebSocket scheme whenever the page itself was loaded over HTTPS.
const WS_SCHEME = location.protocol === 'https:' ? 'wss:' : 'ws:';
const WS_BASE = `${WS_SCHEME}//${location.host}${WS_PATH}`;

// ─── State ──────────────────────────────────────────────────────────────────
// Shared, mutable handles for the capture session; null while nothing is set up.
let ws = null;            // WebSocket to the backend
let audioContext = null;  // AudioContext that owns the audio graph
let processor = null;     // ScriptProcessorNode
let stream = null;        // MediaStream
let isRecording = false;  // set to true by the socket's onopen handler

// ─── DOM Refs ───────────────────────────────────────────────────────────────
const recordBtn = document.getElementById('recordBtn');
const langSelect = document.getElementById('langSelect');
const statusEl = document.getElementById('status');
const transcriptEl = document.getElementById('transcript');

// ─── WebSocket URL Builder ──────────────────────────────────────────────────
/**
 * Build the WebSocket URL for the language currently chosen in the selector.
 *
 * When the selector value is "auto" (or empty) the base URL is returned
 * unchanged, with no query string. Any other value is attached as the
 * `language` query parameter, percent-encoded by URLSearchParams so that an
 * unexpected value can never inject extra parameters.
 *
 * @returns {string} The URL to pass to `new WebSocket(...)`.
 */
function getWSURL() {
  const lang = langSelect.value;
  if (!lang || lang === 'auto') {
    return WS_BASE;
  }

  const url = new URL(WS_BASE);
  url.searchParams.set('language', lang);
  return url.toString();
}

// ─── Status Helper ──────────────────────────────────────────────────────────
/**
 * Show a short status message in the status element.
 *
 * The text is assigned through `textContent`, so it is rendered literally and
 * never interpreted as markup.
 *
 * @param {string} text - Message to display.
 */
function setStatus(text) {
  statusEl.textContent = text;
}

// ─── Audio Capture Setup ────────────────────────────────────────────────────
// True from the moment a start is requested until the socket opens or the
// attempt fails. `isRecording` only becomes true in onopen, so without this
// flag a second click while connecting would open a parallel mic + socket.
let isConnecting = false;

/**
 * Release everything a single capture session created.
 *
 * Every step tolerates missing or already-released resources, so this can be
 * called from both the failure path and the socket's close handler.
 *
 * @param {{mediaStream: ?MediaStream, context: ?AudioContext,
 *          node: ?ScriptProcessorNode, socket: ?WebSocket}} session
 */
function releaseCapture({ mediaStream, context, node, socket }) {
  if (node) {
    node.onaudioprocess = null;
    node.disconnect();
  }

  // Stopping the tracks is what actually releases the microphone.
  mediaStream?.getTracks().forEach((track) => track.stop());

  // close() rejects on an already-closed context, hence the state check.
  if (context && context.state !== 'closed') {
    context.close().catch((err) => console.warn('AudioContext close failed:', err));
  }

  if (
    socket &&
    socket.readyState !== WebSocket.CLOSING &&
    socket.readyState !== WebSocket.CLOSED
  ) {
    socket.close();
  }
}

/**
 * Start capturing microphone audio and streaming it to the backend.
 *
 * Steps: request the microphone, create a 16 kHz AudioContext, route the
 * stream through a ScriptProcessorNode, open the WebSocket returned by
 * getWSURL(), and forward every audio buffer as raw Float32 samples while the
 * socket is open. `isRecording` and the button label flip once the socket
 * opens. Each incoming message is parsed as JSON and its `full_text` field is
 * shown in the transcript element.
 *
 * Calls made while a start is already in flight, or while recording, are
 * ignored. On any failure all resources acquired so far are released and the
 * error is reported through setStatus() and console.error; the returned
 * promise never rejects.
 *
 * @returns {Promise<void>}
 */
async function startRecording() {
  if (isConnecting || isRecording) {
    return;
  }
  isConnecting = true;

  // Everything this invocation creates, so cleanup never touches a newer session.
  const session = { mediaStream: null, context: null, node: null, socket: null };

  try {
    // Step 1: Get microphone access
    session.mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });

    // Step 2: Create AudioContext at EXACTLY 16kHz (required by DashScope)
    session.context = new AudioContext({ sampleRate: 16000 });

    // Step 3: Create media stream source
    const source = session.context.createMediaStreamSource(session.mediaStream);

    // Step 4: Create ScriptProcessor for raw PCM access
    // Buffer size 4096 → ~256ms chunks at 16kHz (lower = more responsive)
    session.node = session.context.createScriptProcessor(4096, 1, 1);

    // Step 5: Connect WebSocket
    const socket = new WebSocket(getWSURL());
    session.socket = socket;

    socket.onopen = () => {
      isConnecting = false;
      isRecording = true;
      recordBtn.textContent = 'Stop Recording';
      setStatus('Listening...');
    };

    socket.onmessage = (e) => {
      let message;
      try {
        message = JSON.parse(e.data);
      } catch (err) {
        // A malformed frame must not throw out of the event handler.
        console.warn('Ignoring non-JSON ASR message:', err);
        return;
      }
      // Display transcript — in our React app, this goes into QueryInput
      transcriptEl.textContent = message?.full_text || '';
    };

    socket.onerror = () => setStatus('WebSocket error');

    socket.onclose = () => {
      // Release the mic and audio graph even when the server closed the socket.
      releaseCapture(session);

      // A late close from an old socket must not clobber a newer session's UI.
      if (ws !== socket) {
        return;
      }
      isConnecting = false;
      isRecording = false;
      recordBtn.textContent = 'Start Recording';
      setStatus('Disconnected');
    };

    // Step 6: Audio processing callback — fires every ~256ms
    session.node.onaudioprocess = (e) => {
      if (!isRecording || socket.readyState !== WebSocket.OPEN) {
        return;
      }
      // Raw Float32 samples from input channel 0 (NOT base64, NOT WAV).
      // Sending the typed-array view transmits exactly the bytes it covers.
      const samples = e.inputBuffer.getChannelData(0);
      socket.send(samples);
    };

    // Step 7: Connect the audio graph. The processor is wired to the
    // destination so it keeps running; its output buffer is never written.
    source.connect(session.node);
    session.node.connect(session.context.destination);

    // Publish to the shared state only once everything is wired up. No socket
    // or audio event can fire before this point because nothing above awaits.
    stream = session.mediaStream;
    audioContext = session.context;
    processor = session.node;
    ws = socket;
  } catch (err) {
    isConnecting = false;
    releaseCapture(session);

    // Only a permission failure is a "denied"; anything else is a setup error.
    const wasDenied = err?.name === 'NotAllowedError' || err?.name === 'SecurityError';
    setStatus(wasDenied ? 'Microphone access denied' : 'Could not start recording');
    console.error('Audio capture error:', err);
  }
}

// ─── Stop Recording ─────────────────────────────────────────────────────────
/**
 * Stop capturing and release every resource held by the shared state.
 *
 * Disconnects the processor, stops the microphone tracks, closes the
 * AudioContext, closes the WebSocket, clears the shared references and resets
 * the button label and status. Safe to call when nothing is running.
 */
function stopRecording() {
  isRecording = false;

  // Clean up audio graph
  if (processor) {
    processor.onaudioprocess = null;
    processor.disconnect();
    processor = null;
  }

  // Stopping the tracks is what actually releases the microphone.
  stream?.getTracks().forEach((track) => track.stop());
  stream = null;

  // Each recording creates a new AudioContext; close it so they do not pile up.
  // close() rejects on an already-closed context, hence the state check.
  if (audioContext) {
    if (audioContext.state !== 'closed') {
      audioContext.close().catch((err) => console.warn('AudioContext close failed:', err));
    }
    audioContext = null;
  }

  // Close WebSocket. The close handler is detached first: it runs
  // asynchronously and would otherwise replace the "Ready" status set below.
  if (ws) {
    ws.onclose = null;
    ws.close();
    ws = null;
  }

  recordBtn.textContent = 'Start Recording';
  setStatus('Ready');
}

// ─── Toggle Recording ───────────────────────────────────────────────────────
// A single button toggles capture: stop when a recording is active, otherwise
// start one.
recordBtn.addEventListener('click', () => {
  if (isRecording) {
    stopRecording();
    return;
  }

  // startRecording() wraps its whole body in try/catch and reports failures
  // itself, so the promise it returns is intentionally not awaited here.
  startRecording();
});
</script>
|
|
</body>
|
|
</html>
|