import {
  LitElement,
  html,
} from "https://cdn.jsdelivr.net/gh/lit/dist@3/core/lit-core.min.js";
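// <audio-recorder> captures microphone input, applies a simple RMS-based
// voice activity gate, downsamples each frame to 16 kHz mono PCM, and streams
// it to the Mesop backend as base64-encoded Int16 data. MesopEvent is
// provided by Mesop's web component runtime; dataEvent and stateChangeEvent
// carry the handler ids supplied from the Mesop side.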
class AudioRecorder extends LitElement {
  static properties = {
    dataEvent: { type: String },
    stateChangeEvent: { type: String },
    state: { type: String },
    isRecording: { type: Boolean },
    debugBuffer: { state: true },
    debug: { type: Boolean },
    voiceDetectionEnabled: { type: Boolean },
    voiceThreshold: { type: Number },
    voiceHoldTime: { type: Number },
  };
  constructor() {
    super();
    this.debug = false;
    this.mediaStream = null;
    this.audioContext = null;
    this.processor = null;
    this.isStreaming = false;
    this.isRecording = false;
    this.isInitializing = false;
    this.sequenceNumber = 0;
    this.debugBuffer = [];
    this.debugBufferSize = 50;
    this.targetSampleRate = 16000;
    // Voice detection parameters
    this.voiceDetectionEnabled = true; // Enabled by default
    this.voiceThreshold = 0.01; // RMS threshold for voice detection
    this.voiceHoldTime = 500; // Time to hold voice detection state in ms
    this.lastVoiceDetectedTime = 0; // Last time voice was detected
    this.isVoiceDetected = false; // Current voice detection state
    this.consecutiveSilentFrames = 0; // Counter for silent frames
    this.silenceThreshold = 10; // Number of silent frames before cutting off
    this.onGeminiLiveStarted = (e) => {
      if (this.isRecording) {
        this.startStreaming();
      }
    };
    this.onGeminiLiveStopped = (e) => {
      this.stop();
    };
  }
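  // Global window events let a companion element elsewhere in the app
  // (presumably a gemini-live-api component) start and stop capture.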
  connectedCallback() {
    super.connectedCallback();
    window.addEventListener(
      "gemini-live-api-started",
      this.onGeminiLiveStarted
    );
    window.addEventListener(
      "gemini-live-api-stopped",
      this.onGeminiLiveStopped
    );
  }
  disconnectedCallback() {
    super.disconnectedCallback();
    this.stop();
    // Remove the same handler references that connectedCallback registered
    window.removeEventListener(
      "gemini-live-api-started",
      this.onGeminiLiveStarted
    );
    window.removeEventListener(
      "gemini-live-api-stopped",
      this.onGeminiLiveStopped
    );
  }
  firstUpdated() {
    if (this.state !== "disabled") {
      this.startStreaming();
    }
  }
  log(...args) {
    if (this.debug) {
      console.log(...args);
    }
  }
  warn(...args) {
    if (this.debug) {
      console.warn(...args);
    }
  }
  error(...args) {
    if (this.debug) {
      console.error(...args);
    }
  }
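  // Treats a frame as voice if its RMS exceeds the threshold, if we are still
  // within the hold window after the last detection, or until enough
  // consecutive silent frames have accumulated to flip the state off.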
  isVoiceFrame(audioData) {
    // Calculate RMS of the audio frame
    let sumSquares = 0;
    for (let i = 0; i < audioData.length; i++) {
      sumSquares += audioData[i] * audioData[i];
    }
    const rms = Math.sqrt(sumSquares / audioData.length);
    const now = Date.now();
    // Check if we detect voice in this frame
    if (rms > this.voiceThreshold) {
      this.lastVoiceDetectedTime = now;
      this.consecutiveSilentFrames = 0;
      this.isVoiceDetected = true;
      return true;
    }
    // Check if we're still within the hold time
    if (now - this.lastVoiceDetectedTime < this.voiceHoldTime) {
      return true;
    }
    // Increment silent frames counter
    this.consecutiveSilentFrames++;
    // If we've seen enough silent frames, mark as silent
    if (this.consecutiveSilentFrames > this.silenceThreshold) {
      this.isVoiceDetected = false;
    }
    return this.isVoiceDetected;
  }
  async startStreaming() {
    if (this.state === "disabled") {
      this.dispatchEvent(new MesopEvent(this.stateChangeEvent, "initializing"));
    }
    this.isInitializing = true;
    const initialized = await this.initialize();
    this.isInitializing = false;
    if (initialized) {
      this.isRecording = true;
      this.dispatchEvent(new MesopEvent(this.stateChangeEvent, "recording"));
      this.start();
    }
  }
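  // Opens the microphone in two passes: a throwaway probe stream to discover
  // the device's native sample rate, then the real capture stream matched to
  // it. echoCancellationType and latency are non-standard or unevenly
  // supported constraints, so browsers may ignore them.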
  async initialize() {
    try {
      // First check what sample rates are supported with echo cancellation
      const testStream = await navigator.mediaDevices.getUserMedia({
        audio: {
          echoCancellation: true,
          noiseSuppression: true,
          autoGainControl: true,
        },
        video: false,
      });
      // Get the actual sample rate from the system
      const systemTrack = testStream.getAudioTracks()[0];
      const settings = systemTrack.getSettings();
      this.log("System audio settings:", settings);
      // Clean up the test stream
      testStream.getTracks().forEach((track) => track.stop());
      // Now create the real stream using the system's capabilities
      this.mediaStream = await navigator.mediaDevices.getUserMedia({
        audio: {
          channelCount: 1,
          sampleRate: settings.sampleRate,
          echoCancellation: true,
          noiseSuppression: true,
          autoGainControl: true,
          echoCancellationType: "system",
          latency: 0,
        },
        video: false,
      });
      // Log the actual constraints that were applied
      const audioTrack = this.mediaStream.getAudioTracks()[0];
      const actualConstraints = audioTrack.getSettings();
      this.log("Applied audio constraints:", actualConstraints);
      // Set up audio context matching the system rate
      this.audioContext = new AudioContext({
        sampleRate: settings.sampleRate,
      });
      this.log(
        "AudioContext created with sample rate:",
        this.audioContext.sampleRate
      );
      const micSource = this.audioContext.createMediaStreamSource(
        this.mediaStream
      );
      this.processor = this.audioContext.createScriptProcessor(4096, 1, 1);
      // Connect the audio nodes
      micSource.connect(this.processor);
      this.processor.connect(this.audioContext.destination);
      return true;
    } catch (error) {
      this.error("Error initializing audio streamer:", error);
      return false;
    }
  }
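  // Naive downsampler: averages each group of source samples into one output
  // sample (a box filter). Assumes the capture rate is at or above the 16 kHz
  // target; there is no proper low-pass filter, which is usually acceptable
  // for speech.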
  downsampleBuffer(buffer, originalSampleRate) {
    if (originalSampleRate === this.targetSampleRate) {
      return buffer;
    }
    const ratio = originalSampleRate / this.targetSampleRate;
    const newLength = Math.floor(buffer.length / ratio);
    const result = new Float32Array(newLength);
    for (let i = 0; i < newLength; i++) {
      const startIndex = Math.floor(i * ratio);
      const endIndex = Math.floor((i + 1) * ratio);
      let sum = 0;
      let count = 0;
      for (let j = startIndex; j < endIndex && j < buffer.length; j++) {
        sum += buffer[j];
        count++;
      }
      result[i] = count > 0 ? sum / count : 0;
    }
    this.log("Downsampling details:", {
      originalRate: originalSampleRate,
      targetRate: this.targetSampleRate,
      originalLength: buffer.length,
      newLength: result.length,
      actualRatio: buffer.length / result.length,
    });
    return result;
  }
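  // Debug-only helper: taps a node with an AnalyserNode and logs its RMS
  // level once per second while streaming.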
  addAudioDebugger(sourceNode, label) {
    if (!this.debug) return;
    const analyser = this.audioContext.createAnalyser();
    analyser.fftSize = 2048;
    sourceNode.connect(analyser);
    const bufferLength = analyser.frequencyBinCount;
    const dataArray = new Float32Array(bufferLength);
    this.debugInterval = setInterval(() => {
      if (!this.isStreaming) return;
      analyser.getFloatTimeDomainData(dataArray);
      let rms = 0;
      for (let i = 0; i < bufferLength; i++) {
        rms += dataArray[i] * dataArray[i];
      }
      rms = Math.sqrt(rms / bufferLength);
      this.log(`${label} RMS Level: ${rms.toFixed(6)}`);
    }, 1000);
  }
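  // Per-frame pipeline: voice gate -> downsample -> fixed gain -> Int16 ->
  // base64 -> dispatch. ScriptProcessorNode is deprecated; an AudioWorklet
  // would be the modern replacement for this callback.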
  start() {
    this.isStreaming = true;
    this.debugBuffer = [];
    this.lastVoiceDetectedTime = 0;
    this.isVoiceDetected = false;
    this.consecutiveSilentFrames = 0;
    this.processor.onaudioprocess = (event) => {
      if (!this.isStreaming) return;
      const inputData = event.inputBuffer.getChannelData(0);
      const originalSampleRate = event.inputBuffer.sampleRate;
      // Log initial processing details if needed
      if (this.sequenceNumber === 0) {
        this.log("Audio Processing Details:", {
          bufferSize: this.processor.bufferSize,
          inputChannels: this.processor.numberOfInputs,
          outputChannels: this.processor.numberOfOutputs,
          originalSampleRate: originalSampleRate,
          targetSampleRate: this.targetSampleRate,
          length: inputData.length,
          timestamp: event.timeStamp,
        });
      }
      // Check for voice activity if enabled
      if (this.voiceDetectionEnabled && !this.isVoiceFrame(inputData)) {
        // Skip this frame if no voice is detected
        this.sequenceNumber++; // Still increment to maintain sequence
        return;
      }
      const downsampledData = this.downsampleBuffer(
        inputData,
        originalSampleRate
      );
      const processedData = new Float32Array(downsampledData.length);
      const gain = 5.0;
      for (let i = 0; i < downsampledData.length; i++) {
        processedData[i] = downsampledData[i] * gain;
      }
      // Debug logging
      if (this.sequenceNumber % 50 === 0 && this.debug) {
        const stats = {
          originalLength: inputData.length,
          downsampledLength: downsampledData.length,
          maxValue: Math.max(...processedData),
          minValue: Math.min(...processedData),
          originalSampleRate,
          targetSampleRate: this.targetSampleRate,
          isVoiceDetected: this.isVoiceDetected,
        };
        this.log("Audio buffer stats:", stats);
      }
      // Store in debug buffer
      this.debugBuffer.push(processedData);
      if (this.debugBuffer.length > this.debugBufferSize) {
        this.debugBuffer.shift();
      }
      // Audio level monitoring
      let rms = 0;
      for (let i = 0; i < processedData.length; i++) {
        rms += processedData[i] * processedData[i];
      }
      rms = Math.sqrt(rms / processedData.length);
      if (this.sequenceNumber % 10 === 0 && this.debug) {
        this.log(
          `Audio Level (RMS): ${rms.toFixed(4)}, Voice Detected: ${
            this.isVoiceDetected
          }`
        );
        if (rms < 0.0001) {
          this.warn(
            "Warning: Very low audio level detected. Check if microphone is working."
          );
        }
      }
      // Convert to Int16Array for transmission
      const intData = new Int16Array(processedData.length);
      for (let i = 0; i < processedData.length; i++) {
        intData[i] = Math.max(
          -32768,
          Math.min(32767, processedData[i] * 32768)
        );
        if (this.sequenceNumber % 100 === 0 && i < 10 && this.debug) {
          this.log(
            `Sample ${i}: Float=${processedData[i].toFixed(4)}, Int16=${
              intData[i]
            }`
          );
        }
      }
      // Convert to base64 and dispatch
      const bytes = new Uint8Array(intData.buffer);
      const base64Data = btoa(
        Array.from(bytes)
          .map((byte) => String.fromCharCode(byte))
          .join("")
      );
      this.dispatchEvent(
        new MesopEvent(this.dataEvent, {
          sequence: this.sequenceNumber++,
          sampleRate: this.targetSampleRate,
          data: base64Data,
          isVoice: this.isVoiceDetected,
        })
      );
      this.dispatchEvent(
        new CustomEvent("audio-input-received", {
          detail: { data: base64Data },
          // Allow event to cross shadow DOM boundaries (both need to be true)
          bubbles: true,
          composed: true,
        })
      );
    };
    return true;
  }
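  // Reports the "disabled" state to Mesop, then detaches the processor
  // callback, stops the mic tracks, and closes the AudioContext.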
  stop() {
    this.isStreaming = false;
    this.isRecording = false;
    this.dispatchEvent(new MesopEvent(this.stateChangeEvent, "disabled"));
    if (this.debugInterval) {
      clearInterval(this.debugInterval);
    }
    if (this.processor) {
      this.processor.onaudioprocess = null;
    }
    if (this.mediaStream) {
      this.mediaStream.getTracks().forEach((track) => track.stop());
    }
    if (this.audioContext) {
      this.audioContext.close();
    }
  }
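  // Replays the rolling debug buffer through the speakers, upsampling the
  // stored 16 kHz frames back to the device rate with linear interpolation.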
  async playbackDebug() {
    if (!this.debugBuffer.length) {
      this.log("No audio data available for playback");
      return;
    }
    const playbackContext = new AudioContext();
    const systemSampleRate = playbackContext.sampleRate;
    const totalSamples16k =
      this.debugBuffer.length * this.debugBuffer[0].length;
    const upsampledLength = Math.round(
      totalSamples16k * (systemSampleRate / this.targetSampleRate)
    );
    const audioBuffer = playbackContext.createBuffer(
      1,
      upsampledLength,
      systemSampleRate
    );
    const channelData = audioBuffer.getChannelData(0);
    const combined16kBuffer = new Float32Array(totalSamples16k);
    let offset = 0;
    for (let i = 0; i < this.debugBuffer.length; i++) {
      combined16kBuffer.set(this.debugBuffer[i], offset);
      offset += this.debugBuffer[i].length;
    }
    const ratio = this.targetSampleRate / systemSampleRate;
    for (let i = 0; i < upsampledLength; i++) {
      const position = i * ratio;
      const index = Math.floor(position);
      const decimal = position - index;
      const sample1 = combined16kBuffer[index] || 0;
      const sample2 = combined16kBuffer[index + 1] || sample1;
      channelData[i] = sample1 + decimal * (sample2 - sample1);
    }
    const source = playbackContext.createBufferSource();
    source.buffer = audioBuffer;
    source.connect(playbackContext.destination);
    source.start();
    this.log("Playing debug audio at system rate...", {
      systemSampleRate,
      originalLength: totalSamples16k,
      upsampledLength,
    });
    source.onended = () => {
      this.log("Debug playback finished");
      playbackContext.close();
    };
  }
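  // The slotted content acts as the toggle control: clicks are ignored while
  // initializing, stop recording while active, and start streaming otherwise.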
  render() {
    if (this.isInitializing) {
      return html`<span><slot></slot></span>`;
    }
    if (this.isRecording) {
      return html`<span @click="${this.stop}"><slot></slot></span>`;
    }
    return html`<span @click="${this.startStreaming}"><slot></slot></span>`;
  }
}
customElements.define("audio-recorder", AudioRecorder);