import {
LitElement,
html,
} from "https://cdn.jsdelivr.net/gh/lit/dist@3/core/lit-core.min.js";
class AudioRecorder extends LitElement {
static properties = {
dataEvent: { type: String },
stateChangeEvent: { type: String },
state: { type: String },
isRecording: { type: Boolean },
debugBuffer: { state: true },
debug: { type: Boolean },
voiceDetectionEnabled: { type: Boolean },
voiceThreshold: { type: Number },
voiceHoldTime: { type: Number },
};
constructor() {
super();
this.debug = false;
this.mediaStream = null;
this.audioContext = null;
this.processor = null;
this.isStreaming = false;
this.isRecording = false;
this.isInitializing = false;
this.sequenceNumber = 0;
this.debugBuffer = [];
this.debugBufferSize = 50;
this.targetSampleRate = 16000;
// Voice detection parameters
this.voiceDetectionEnabled = true; // Enable by default
this.voiceThreshold = 0.01; // RMS threshold for voice detection
this.voiceHoldTime = 500; // Time to hold voice detection state in ms
this.lastVoiceDetectedTime = 0; // Last time voice was detected
this.isVoiceDetected = false; // Current voice detection state
this.consecutiveSilentFrames = 0; // Counter for silent frames
this.silenceThreshold = 10; // Number of silent frames before cutting off
this.onGeminiLiveStarted = (e) => {
if (this.isRecording) {
this.startStreaming();
}
};
this.onGeminiLiveStopped = (e) => {
this.stop();
};
}
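/**
 * Listen for window-level lifecycle events (dispatched elsewhere on the
 * page, e.g. by a gemini-live-api component) so recording follows the
 * API session's start/stop state.
 */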
connectedCallback() {
super.connectedCallback();
window.addEventListener(
"gemini-live-api-started",
this.onGeminiLiveStarted
);
window.addEventListener(
"gemini-live-api-stopped",
this.onGeminiLiveStopped
);
}
disconnectedCallback() {
super.disconnectedCallback();
this.stop();
window.removeEventListener(
"gemini-live-api-started",
this.onGeminiLiveStarted
);
window.removeEventListener(
"gemini-live-api-stopped",
this.onGeminiLiveStopped
);
}
firstUpdated() {
if (this.state !== "disabled") {
this.startStreaming();
}
}
log(...args) {
if (this.debug) {
console.log(...args);
}
}
warn(...args) {
if (this.debug) {
console.warn(...args);
}
}
error(...args) {
if (this.debug) {
console.error(...args);
}
}
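/**
 * Energy-based voice activity detection: computes the RMS of the frame
 * and compares it to voiceThreshold. After the last voiced frame,
 * detection is held "on" for voiceHoldTime ms, plus up to
 * silenceThreshold further silent frames, to avoid clipping the tail of
 * an utterance.
 */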
isVoiceFrame(audioData) {
// Calculate RMS of the audio frame
let sumSquares = 0;
for (let i = 0; i < audioData.length; i++) {
sumSquares += audioData[i] * audioData[i];
}
const rms = Math.sqrt(sumSquares / audioData.length);
const now = Date.now();
// Check if we detect voice in this frame
if (rms > this.voiceThreshold) {
this.lastVoiceDetectedTime = now;
this.consecutiveSilentFrames = 0;
this.isVoiceDetected = true;
return true;
}
// Check if we're still within the hold time
if (now - this.lastVoiceDetectedTime < this.voiceHoldTime) {
return true;
}
// Increment silent frames counter
this.consecutiveSilentFrames++;
// If we've seen enough silent frames, mark as silent
if (this.consecutiveSilentFrames > this.silenceThreshold) {
this.isVoiceDetected = false;
}
return this.isVoiceDetected;
}
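/**
 * Request microphone access and, on success, transition to the recording
 * state and attach the audio processing callback. When starting from the
 * disabled state, an "initializing" state change is emitted first.
 */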
async startStreaming() {
if (this.state === "disabled") {
this.dispatchEvent(new MesopEvent(this.stateChangeEvent, "initializing"));
}
this.isInitializing = true;
const initialized = await this.initialize();
this.isInitializing = false;
if (initialized) {
this.isRecording = true;
this.dispatchEvent(new MesopEvent(this.stateChangeEvent, "recording"));
this.start();
}
}
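/**
 * Set up the capture pipeline. A throwaway getUserMedia stream is opened
 * first to discover the system's native sample rate; the real stream and
 * the AudioContext are then created at that rate, so no implicit
 * resampling happens before our own downsampling step.
 */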
async initialize() {
try {
// First check what sample rates are supported with echo cancellation
const testStream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
},
video: false,
});
// Get the actual sample rate from the system
const systemTrack = testStream.getAudioTracks()[0];
const settings = systemTrack.getSettings();
this.log("System audio settings:", settings);
// Clean up the test stream
testStream.getTracks().forEach((track) => track.stop());
// Now create the real stream using the system's capabilities
this.mediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
sampleRate: settings.sampleRate,
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
// Non-standard constraint (a Chrome-specific hint); browsers that
// don't recognize it simply ignore it.
echoCancellationType: "system",
latency: 0,
},
video: false,
});
// Log the actual constraints that were applied
const audioTrack = this.mediaStream.getAudioTracks()[0];
const actualConstraints = audioTrack.getSettings();
this.log("Applied audio constraints:", actualConstraints);
// Set up audio context matching the system rate
this.audioContext = new AudioContext({
sampleRate: settings.sampleRate,
});
this.log(
"AudioContext created with sample rate:",
this.audioContext.sampleRate
);
const micSource = this.audioContext.createMediaStreamSource(
this.mediaStream
);
// ScriptProcessorNode is deprecated in favor of AudioWorklet, but it
// remains widely supported and keeps this component to a single file.
this.processor = this.audioContext.createScriptProcessor(4096, 1, 1);
// Connect the audio nodes
micSource.connect(this.processor);
this.processor.connect(this.audioContext.destination);
return true;
} catch (error) {
this.error("Error initializing audio streamer:", error);
return false;
}
}
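/**
 * Downsample a Float32 buffer from the capture rate to targetSampleRate
 * (16 kHz) by averaging each span of source samples. The averaging acts
 * as a crude low-pass filter, limiting (but not eliminating) aliasing.
 */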
downsampleBuffer(buffer, originalSampleRate) {
if (originalSampleRate === this.targetSampleRate) {
return buffer;
}
const ratio = originalSampleRate / this.targetSampleRate;
const newLength = Math.floor(buffer.length / ratio);
const result = new Float32Array(newLength);
for (let i = 0; i < newLength; i++) {
const startIndex = Math.floor(i * ratio);
const endIndex = Math.floor((i + 1) * ratio);
let sum = 0;
let count = 0;
for (let j = startIndex; j < endIndex && j < buffer.length; j++) {
sum += buffer[j];
count++;
}
result[i] = count > 0 ? sum / count : 0;
}
this.log("Downsampling details:", {
originalRate: originalSampleRate,
targetRate: this.targetSampleRate,
originalLength: buffer.length,
newLength: result.length,
actualRatio: buffer.length / result.length,
});
return result;
}
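/**
 * Debug-only helper: attaches an AnalyserNode to the given source and
 * logs its RMS level once per second while streaming.
 */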
addAudioDebugger(sourceNode, label) {
if (!this.debug) return;
const analyser = this.audioContext.createAnalyser();
analyser.fftSize = 2048;
sourceNode.connect(analyser);
const bufferLength = analyser.frequencyBinCount;
const dataArray = new Float32Array(bufferLength);
this.debugInterval = setInterval(() => {
if (!this.isStreaming) return;
analyser.getFloatTimeDomainData(dataArray);
let rms = 0;
for (let i = 0; i < bufferLength; i++) {
rms += dataArray[i] * dataArray[i];
}
rms = Math.sqrt(rms / bufferLength);
this.log(`${label} RMS Level: ${rms.toFixed(6)}`);
}, 1000);
}
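/**
 * Attach the onaudioprocess handler. Each frame is optionally gated by
 * voice detection, downsampled to 16 kHz, gain-boosted, converted to
 * Int16 PCM, base64-encoded, and dispatched both as a MesopEvent and as
 * a composed CustomEvent for other components on the page.
 */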
start() {
this.isStreaming = true;
this.debugBuffer = [];
this.lastVoiceDetectedTime = 0;
this.isVoiceDetected = false;
this.consecutiveSilentFrames = 0;
this.processor.onaudioprocess = (event) => {
if (!this.isStreaming) return;
const inputData = event.inputBuffer.getChannelData(0);
const originalSampleRate = event.inputBuffer.sampleRate;
// Log initial processing details if needed
if (this.sequenceNumber === 0) {
this.log("Audio Processing Details:", {
bufferSize: this.processor.bufferSize,
numberOfInputs: this.processor.numberOfInputs,
numberOfOutputs: this.processor.numberOfOutputs,
originalSampleRate: originalSampleRate,
targetSampleRate: this.targetSampleRate,
length: inputData.length,
timestamp: event.timeStamp,
});
}
// Check for voice activity if enabled
if (this.voiceDetectionEnabled && !this.isVoiceFrame(inputData)) {
// Skip this frame if no voice is detected
this.sequenceNumber++; // Still increment to maintain sequence
return;
}
const downsampledData = this.downsampleBuffer(
inputData,
originalSampleRate
);
const processedData = new Float32Array(downsampledData.length);
// Fixed make-up gain; boosted samples are clamped during the Int16
// conversion below, so large values can clip.
const gain = 5.0;
for (let i = 0; i < downsampledData.length; i++) {
processedData[i] = downsampledData[i] * gain;
}
// Debug logging
if (this.sequenceNumber % 50 === 0 && this.debug) {
const stats = {
originalLength: inputData.length,
downsampledLength: downsampledData.length,
maxValue: Math.max(...processedData),
minValue: Math.min(...processedData),
originalSampleRate,
targetSampleRate: this.targetSampleRate,
isVoiceDetected: this.isVoiceDetected,
};
this.log("Audio buffer stats:", stats);
}
// Store in debug buffer
this.debugBuffer.push(processedData);
if (this.debugBuffer.length > this.debugBufferSize) {
this.debugBuffer.shift();
}
// Audio level monitoring
let rms = 0;
for (let i = 0; i < processedData.length; i++) {
rms += processedData[i] * processedData[i];
}
rms = Math.sqrt(rms / processedData.length);
if (this.sequenceNumber % 10 === 0 && this.debug) {
this.log(
`Audio Level (RMS): ${rms.toFixed(4)}, Voice Detected: ${
this.isVoiceDetected
}`
);
if (rms < 0.0001) {
this.warn(
"Warning: Very low audio level detected. Check if microphone is working."
);
}
}
// Convert to Int16Array for transmission
const intData = new Int16Array(processedData.length);
for (let i = 0; i < processedData.length; i++) {
intData[i] = Math.max(
-32768,
Math.min(32767, processedData[i] * 32768)
);
if (this.sequenceNumber % 100 === 0 && i < 10 && this.debug) {
this.log(
`Sample ${i}: Float=${processedData[i].toFixed(4)}, Int16=${
intData[i]
}`
);
}
}
// Convert to base64 and dispatch
const bytes = new Uint8Array(intData.buffer);
const base64Data = btoa(
Array.from(bytes)
.map((byte) => String.fromCharCode(byte))
.join("")
);
this.dispatchEvent(
new MesopEvent(this.dataEvent, {
sequence: this.sequenceNumber++,
sampleRate: this.targetSampleRate,
data: base64Data,
isVoice: this.isVoiceDetected,
})
);
this.dispatchEvent(
new CustomEvent("audio-input-received", {
detail: { data: base64Data },
// Allow event to cross shadow DOM boundaries (both need to be true)
bubbles: true,
composed: true,
})
);
};
return true;
}
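/**
 * Tear down the pipeline: detach the processing callback, stop all media
 * tracks, close the AudioContext, and report the "disabled" state.
 */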
stop() {
this.isStreaming = false;
this.isRecording = false;
this.dispatchEvent(new MesopEvent(this.stateChangeEvent, "disabled"));
if (this.debugInterval) {
clearInterval(this.debugInterval);
}
if (this.processor) {
this.processor.onaudioprocess = null;
}
if (this.mediaStream) {
this.mediaStream.getTracks().forEach((track) => track.stop());
}
if (this.audioContext) {
this.audioContext.close();
}
}
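/**
 * Debug helper: concatenates the retained 16 kHz buffers, upsamples them
 * to the playback context's rate with linear interpolation, and plays
 * the result so the captured audio can be checked by ear.
 */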
async playbackDebug() {
if (!this.debugBuffer.length) {
this.log("No audio data available for playback");
return;
}
const playbackContext = new AudioContext();
const systemSampleRate = playbackContext.sampleRate;
const totalSamples16k =
this.debugBuffer.length * this.debugBuffer[0].length;
const upsampledLength = Math.round(
totalSamples16k * (systemSampleRate / this.targetSampleRate)
);
const audioBuffer = playbackContext.createBuffer(
1,
upsampledLength,
systemSampleRate
);
const channelData = audioBuffer.getChannelData(0);
const combined16kBuffer = new Float32Array(totalSamples16k);
let offset = 0;
for (let i = 0; i < this.debugBuffer.length; i++) {
combined16kBuffer.set(this.debugBuffer[i], offset);
offset += this.debugBuffer[i].length;
}
const ratio = this.targetSampleRate / systemSampleRate;
for (let i = 0; i < upsampledLength; i++) {
const position = i * ratio;
const index = Math.floor(position);
const decimal = position - index;
const sample1 = combined16kBuffer[index] || 0;
const sample2 = combined16kBuffer[index + 1] || sample1;
channelData[i] = sample1 + decimal * (sample2 - sample1);
}
const source = playbackContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(playbackContext.destination);
source.start();
this.log("Playing debug audio at system rate...", {
systemSampleRate,
originalLength: totalSamples16k,
upsampledLength,
});
source.onended = () => {
this.log("Debug playback finished");
playbackContext.close();
};
}
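/**
 * The slotted content acts as the toggle: clicking it stops recording
 * while active, or starts streaming otherwise; clicks are ignored while
 * initializing.
 */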
render() {
if (this.isInitializing) {
return html`<span><slot></slot></span>`;
}
if (this.isRecording) {
return html`<span @click="${this.stop}"><slot></slot></span> `;
}
return html`<span @click="${this.startStreaming}"><slot></slot></span>`;
}
}
customElements.define("audio-recorder", AudioRecorder);
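// Minimal usage sketch (illustrative, not part of the original source).
// It assumes the element runs inside a Mesop page, which provides the
// global MesopEvent constructor this component dispatches; the event
// names below are hypothetical placeholders.
//
//   const recorder = document.createElement("audio-recorder");
//   recorder.dataEvent = "audioDataEvent";         // assumed handler id
//   recorder.stateChangeEvent = "audioStateEvent"; // assumed handler id
//   recorder.debug = true;
//   recorder.textContent = "Toggle mic";           // slotted toggle label
//   document.body.appendChild(recorder);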