Upload 6 files
- MicrophoneAudio.ts +119 -0
- Silero.ts +155 -0
- SpeechChunks.ts +126 -0
- VoiceActivityDetector.ts +109 -0
- globals.css +33 -0
- index.html +64 -0
MicrophoneAudio.ts
ADDED
@@ -0,0 +1,119 @@
// Captures microphone input with getUserMedia and slices it into fixed-size
// Float32Array windows via an inline AudioWorklet, delivering each window to
// the onAudioData callback.
class MicrophoneAudio {
  constructor(options) {
    console.log('Initializing MicrophoneAudio');
    this.options = {
      sampleRate: 16000,
      channels: 1,
      ...options,
    };
    this.stream = null;
    this.audioContext = null;
    this.sourceNode = null;
    this.workletNode = null;
    this.buffer = new Float32Array();
    console.log(`MicrophoneAudio options: ${JSON.stringify(this.options)}`);
  }

  getDeviceId() {
    console.log('Getting device ID');
    // Note: this opens a second MediaStream just to read the device ID and
    // never stops its tracks, so that extra capture stays live.
    return navigator.mediaDevices.getUserMedia({ audio: true }).then((stream) => {
      const deviceId = stream.getTracks()[0].getSettings().deviceId;
      console.log("The device Id is", deviceId);
      return deviceId;
    });
  }

  async start() {
    console.log('Starting MicrophoneAudio');
    try {
      this.stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          sampleRate: this.options.sampleRate,
          channelCount: this.options.channels,
        },
      });
      console.log('MediaStream acquired');

      this.getDeviceId().then((deviceId) => {
        console.log("The device Id is", deviceId);
      });
      this.audioContext = new AudioContext({
        sampleRate: this.options.sampleRate,
      });

      // The worklet source is inlined as a Blob URL; the window size is baked
      // in via template interpolation, so options.windowSizeSamples must be set.
      await this.audioContext.audioWorklet.addModule(
        URL.createObjectURL(new Blob([`
          class AudioProcessor extends AudioWorkletProcessor {
            constructor() {
              super();
              this.buffer = new Float32Array();
            }

            process(inputs, outputs, parameters) {
              const input = inputs[0];
              const channelData = input[0];

              this.buffer = Float32Array.from([...this.buffer, ...channelData]);

              while (this.buffer.length >= ${this.options.windowSizeSamples}) {
                const chunk = this.buffer.slice(0, ${this.options.windowSizeSamples});
                this.port.postMessage(chunk);
                this.buffer = this.buffer.slice(${this.options.windowSizeSamples});
              }

              return true;
            }
          }

          registerProcessor('audio-processor', AudioProcessor);
        `], { type: 'application/javascript' }))
      );

      this.sourceNode = this.audioContext.createMediaStreamSource(this.stream);
      this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');

      this.workletNode.port.onmessage = (event) => {
        this.options.onAudioData(event.data);
      };

      this.sourceNode.connect(this.workletNode);
      this.workletNode.connect(this.audioContext.destination);
      console.log('AudioWorklet added and connected');
    } catch (error) {
      console.error('Error starting microphone:', error);
      throw error;
    }
  }

  stop() {
    console.log('Stopping MicrophoneAudio');
    if (this.workletNode) {
      // The worklet defines no message handler, so this 'flush' is a no-op;
      // any partial window still buffered inside the worklet is dropped.
      this.workletNode.port.postMessage('flush');
      this.workletNode.disconnect();
      this.workletNode = null;
    }

    if (this.sourceNode) {
      this.sourceNode.disconnect();
      this.sourceNode = null;
    }

    if (this.audioContext) {
      this.audioContext.close();
      this.audioContext = null;
    }

    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop());
      this.stream = null;
    }

    // Note: this.buffer is never written to on the main thread (framing
    // happens inside the worklet), so this flush currently never fires.
    if (this.buffer.length > 0) {
      this.options.onAudioData(this.buffer);
      this.buffer = new Float32Array();
    }
    console.log('MicrophoneAudio stopped');
  }
}

export default MicrophoneAudio;
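For orientation, a minimal, hypothetical caller (not part of the upload): the class needs a sample rate, a window size, and a callback, and windowSizeSamples has no default, so it must be supplied (SpeechChunks.ts below passes 512).

  import MicrophoneAudio from './MicrophoneAudio.ts';

  // Illustrative only; assumes a browser ES module context (top-level await)
  // and that the user grants microphone permission.
  const mic = new MicrophoneAudio({
    sampleRate: 16000,
    windowSizeSamples: 512,     // no default exists; SpeechChunks.ts uses 512
    onAudioData: (frame) => {   // one Float32Array of 512 samples per call
      console.log('got frame of', frame.length, 'samples');
    },
  });
  await mic.start();
  // ... later:
  mic.stop();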
Silero.ts
ADDED
@@ -0,0 +1,155 @@
// Thin wrapper around the Silero VAD ONNX model. Expects the onnxruntime-web
// global `ort` to be loaded beforehand (index.html pulls it from a CDN).
class OnnxWrapper {
  constructor(path, force_onnx_cpu = true) {
    console.log(`Initializing OnnxWrapper with path: ${path}`);
    this.sessionReady = this.initSession(path, force_onnx_cpu);
    this.resetStates();
    this.sample_rates = [8000, 16000];
  }

  async ready() {
    console.log('Waiting for OnnxWrapper session to be ready');
    await this.sessionReady;
    console.log('OnnxWrapper session is ready');
  }

  async initSession(path, force_onnx_cpu) {
    console.log(`Initializing ONNX session with force_onnx_cpu: ${force_onnx_cpu}`);
    const options = {
      executionProviders: force_onnx_cpu ? ['wasm'] : ['webgl', 'wasm'],
      graphOptimizationLevel: 'all',
      executionMode: 'sequential',
      enableCpuMemArena: true,
      enableMemPattern: true,
      extra: {
        session: {
          intra_op_num_threads: 1,
          inter_op_num_threads: 1,
        }
      }
    };

    this.session = await ort.InferenceSession.create(path, options);
    console.log('ONNX session created successfully');
  }

  _validate_input(x, sr) {
    if (!Array.isArray(x[0])) {
      x = [x];
    }
    if (x.length > 2) {
      throw new Error(`Too many dimensions for input audio chunk ${x.length}`);
    }
    // Downsample multiples of 16 kHz by simple decimation.
    if (sr !== 16000 && (sr % 16000 === 0)) {
      const step = Math.floor(sr / 16000);
      x = x.map(row => row.filter((_, i) => i % step === 0));
      sr = 16000;
    }
    if (!this.sample_rates.includes(sr)) {
      throw new Error(`Supported sampling rates: ${this.sample_rates} (or a multiple of 16000)`);
    }
    if (sr / x[0].length > 31.25) {
      throw new Error("Input audio chunk is too short");
    }
    return [x, sr];
  }

  resetStates(batch_size = 1) {
    console.log(`Resetting states with batch_size: ${batch_size}`);
    this._state = Array(2).fill(0).map(() => Array(batch_size * 128).fill(0));
    this._context = [];
    this._last_sr = 0;
    this._last_batch_size = 0;
  }

  async call(x, sr) {
    console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
    await this.ready();
    [x, sr] = this._validate_input(x, sr);
    const num_samples = sr === 16000 ? 512 : 256;

    if (x[0].length !== num_samples) {
      throw new Error(`Provided number of samples is ${x[0].length} (Supported values: 256 for 8000 sample rate, 512 for 16000)`);
    }

    const batch_size = x.length;
    const context_size = sr === 16000 ? 64 : 32;

    // Reset state whenever the batch size or sample rate changes between calls.
    if (!this._last_batch_size) {
      this.resetStates(batch_size);
    }
    if (this._last_sr && this._last_sr !== sr) {
      this.resetStates(batch_size);
    }
    if (this._last_batch_size && this._last_batch_size !== batch_size) {
      this.resetStates(batch_size);
    }
    if (this._context.length === 0) {
      this._context = Array(batch_size * context_size).fill(0);
    }

    // Prepend the trailing context from the previous window to each row.
    x = x.map((row, i) => [...this._context.slice(i * context_size, (i + 1) * context_size), ...row]);

    if (sr === 8000 || sr === 16000) {
      const inputTensor = new ort.Tensor('float32', x.flat(), [batch_size, x[0].length]);
      const stateTensor = new ort.Tensor('float32', this._state.flat(), [2, batch_size, 128]);
      const srTensor = new ort.Tensor('int64', [sr], []);

      const feeds = {
        input: inputTensor,
        state: stateTensor,
        sr: srTensor
      };

      const results = await this.session.run(feeds);
      const outputData = results.output.data;
      const stateData = results.stateN.data;

      this._state = Array(2).fill(0).map((_, i) =>
        Array.from(stateData.slice(i * batch_size * 128, (i + 1) * batch_size * 128))
      );

      const outputShape = results.output.dims;
      const out = Array(outputShape[0]).fill(0).map((_, i) =>
        Array.from(outputData.slice(i * outputShape[1], (i + 1) * outputShape[1]))
      );

      // Keep the last context_size samples of each row for the next call.
      this._context = x.map(row => row.slice(-context_size)).flat();
      this._last_sr = sr;
      this._last_batch_size = batch_size;

      console.log(`Model call completed, output shape: [${out.length}, ${out[0].length}]`);
      return out;
    } else {
      throw new Error(`Unsupported sample rate: ${sr}. Supported rates are 8000 and 16000.`);
    }
  }

  async audio_forward(x, sr) {
    console.log(`Running audio_forward with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
    const outs = [];
    [x, sr] = this._validate_input(x, sr);
    this.resetStates();
    const num_samples = sr === 16000 ? 512 : 256;

    // Zero-pad so the audio divides evenly into model-sized windows.
    if (x[0].length % num_samples !== 0) {
      const pad_num = num_samples - (x[0].length % num_samples);
      x = x.map(row => [...row, ...Array(pad_num).fill(0)]);
    }

    for (let i = 0; i < x[0].length; i += num_samples) {
      const wavs_batch = x.map(row => row.slice(i, i + num_samples));
      const out_chunk = await this.call(wavs_batch, sr);
      outs.push(out_chunk);
    }

    console.log(`audio_forward completed, output shape: [${outs.length}, ${outs[0].length}]`);
    return outs.reduce((acc, curr) => acc.map((row, i) => [...row, ...curr[i]]));
  }

  close() {
    console.log('Closing OnnxWrapper session');
    this.session.release();
  }
}

export default OnnxWrapper;
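A sketch of driving the wrapper directly, assuming silero_vad.onnx is served alongside the page and the ort global is already loaded (as index.html below arranges): the model accepts only 512-sample frames at 16 kHz (256 at 8 kHz) and returns one speech probability per batch row.

  import OnnxWrapper from './Silero.ts';

  // Illustrative only; a 512-sample frame of silence should score near 0.
  const vad = new OnnxWrapper('silero_vad.onnx');
  const frame = new Array(512).fill(0);        // one 32 ms frame at 16 kHz
  const out = await vad.call([frame], 16000);  // resolves to shape [1, 1]
  console.log('speech probability:', out[0][0]);
  vad.close();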
SpeechChunks.ts
ADDED
@@ -0,0 +1,126 @@
import MicrophoneAudio from './MicrophoneAudio.ts';
import { VadDetector } from './VoiceActivityDetector.ts';

// Glues microphone capture to the VAD: buffers audio while speech is active
// and hands a WAV blob to onSpeechEnd when the detector reports an end.
export class SpeechChunks {
  static SAMPLE_RATE = 16000;
  static START_THRESHOLD = 0.6;
  static END_THRESHOLD = 0.45;
  static MIN_SILENCE_DURATION_MS = 600;
  static SPEECH_PAD_MS = 500;
  static WINDOW_SIZE_SAMPLES = 512;

  constructor(onSpeechStart, onSpeechEnd) {
    this.chunks = [];
    this.isSpeechActive = false;
    this.onSpeechStart = onSpeechStart;
    this.onSpeechEnd = onSpeechEnd;
    console.log('SpeechChunks initialized');
  }

  async initialize() {
    this.microphoneAudio = new MicrophoneAudio({
      sampleRate: SpeechChunks.SAMPLE_RATE,
      windowSizeSamples: SpeechChunks.WINDOW_SIZE_SAMPLES,
      onAudioData: this.processAudioData.bind(this)
    });

    this.vadDetector = new VadDetector(
      SpeechChunks.START_THRESHOLD,
      SpeechChunks.END_THRESHOLD,
      SpeechChunks.SAMPLE_RATE,
      SpeechChunks.MIN_SILENCE_DURATION_MS,
      SpeechChunks.SPEECH_PAD_MS
    );
  }

  async processAudioData(audioData) {
    console.log(`Processing audio data of length ${audioData.length}`);
    try {
      const result = await this.vadDetector.apply(audioData, false);
      if (result.start !== undefined) {
        this.isSpeechActive = true;
        console.log('Speech start detected');
        this.onSpeechStart();
      } else if (result.end !== undefined) {
        this.isSpeechActive = false;
        console.log('Speech end detected');
        this.onSpeechEnd(this.getBlob());
        // Reset the buffer so the next utterance does not replay this one.
        this.chunks = [];
      }
      if (this.isSpeechActive) {
        console.log('Adding chunk to speech');
        this.chunks.push(Array.from(audioData));
      }
    } catch (error) {
      console.error('Error processing audio data', error);
    }
  }

  async start() {
    console.log('Starting SpeechChunks');
    await this.initialize();
    await this.microphoneAudio.start();
  }

  stop() {
    console.log('Stopping SpeechChunks');
    this.microphoneAudio.stop();
    this.vadDetector.reset();
    this.isSpeechActive = false;
  }

  getSpeechChunks() {
    console.log(`Returning ${this.chunks.length} speech chunks`);
    const speechChunks = this.chunks;
    this.chunks = [];
    return speechChunks;
  }

  getBlob() {
    console.log('Creating audio blob from speech chunks');
    const combinedChunks = this.chunks.flat();
    const combinedAudio = new Float32Array(combinedChunks);

    // Convert float samples in [-1, 1] to 16-bit PCM.
    const intData = new Int16Array(combinedAudio.length);
    for (let i = 0; i < combinedAudio.length; i++) {
      const s = Math.max(-1, Math.min(1, combinedAudio[i]));
      intData[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
    }

    // Write a canonical 44-byte WAV header (PCM, mono, 16-bit).
    const buffer = new ArrayBuffer(44 + intData.length * 2);
    const view = new DataView(buffer);

    this.writeString(view, 0, 'RIFF');
    view.setUint32(4, 36 + intData.length * 2, true);
    this.writeString(view, 8, 'WAVE');
    this.writeString(view, 12, 'fmt ');
    view.setUint32(16, 16, true);
    view.setUint16(20, 1, true);
    view.setUint16(22, 1, true);
    view.setUint32(24, SpeechChunks.SAMPLE_RATE, true);
    view.setUint32(28, SpeechChunks.SAMPLE_RATE * 2, true);
    view.setUint16(32, 2, true);
    view.setUint16(34, 16, true);
    this.writeString(view, 36, 'data');
    view.setUint32(40, intData.length * 2, true);

    for (let i = 0; i < intData.length; i++) {
      view.setInt16(44 + i * 2, intData[i], true);
    }

    const blob = new Blob([buffer], { type: 'audio/wav' });
    console.log(`Created blob of size ${blob.size} bytes`);
    return blob;
  }

  writeString(view, offset, string) {
    for (let i = 0; i < string.length; i++) {
      view.setUint8(offset + i, string.charCodeAt(i));
    }
  }

  async close() {
    console.log('Closing SpeechChunks');
    this.stop();
    await this.vadDetector.close();
  }
}
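As a sanity check on what getBlob() emits — a canonical 44-byte PCM WAV header followed by mono 16-bit samples — here is a small illustrative parser (not part of the upload) that reads the fields back:

  // Hypothetical helper; field offsets match the writes in getBlob() above.
  async function inspectWavHeader(blob) {
    const view = new DataView(await blob.arrayBuffer());
    const tag = (o) => String.fromCharCode(...[0, 1, 2, 3].map(i => view.getUint8(o + i)));
    console.log({
      riff: tag(0),                            // "RIFF"
      wave: tag(8),                            // "WAVE"
      audioFormat: view.getUint16(20, true),   // 1 = PCM
      channels: view.getUint16(22, true),      // 1 (mono)
      sampleRate: view.getUint32(24, true),    // 16000
      byteRate: view.getUint32(28, true),      // 32000 = 16000 * 1 channel * 2 bytes
      bitsPerSample: view.getUint16(34, true), // 16
      dataBytes: view.getUint32(40, true),     // number of samples * 2
    });
  }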
VoiceActivityDetector.ts
ADDED
@@ -0,0 +1,109 @@
import OnnxWrapper from './Silero.ts';

const modelPath = "silero_vad.onnx"; // Make sure this path is correct

// Stateful start/end detector on top of the Silero model: speech starts when
// the probability crosses startThreshold and ends once it has stayed below
// endThreshold for at least minSilenceDurationMs.
export class VadDetector {
  constructor(startThreshold, endThreshold, samplingRate, minSilenceDurationMs, speechPadMs) {
    if (samplingRate !== 8000 && samplingRate !== 16000) {
      throw new Error("Does not support sampling rates other than [8000, 16000]");
    }

    this.model = new OnnxWrapper(modelPath);
    this.startThreshold = startThreshold;
    this.endThreshold = endThreshold;
    this.samplingRate = samplingRate;
    this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
    this.speechPadSamples = samplingRate * speechPadMs / 1000;
    this.reset();
    console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
  }

  reset() {
    this.model.resetStates();
    this.triggered = false;
    this.tempEnd = 0;
    this.currentSample = 0;
    console.log('VadDetector reset');
  }

  async apply(data, returnSeconds) {
    console.log(`Applying VAD to data of length ${data.length}`);
    const windowSizeSamples = data.length;
    this.currentSample += windowSizeSamples;

    const rowLength = this.samplingRate === 16000 ? 512 : 256;

    // Ensure data is the correct length
    if (data.length < rowLength) {
      console.warn(`Input data length (${data.length}) is less than required (${rowLength}). Padding with zeros.`);
      data = [...data, ...new Array(rowLength - data.length).fill(0)];
    } else if (data.length > rowLength) {
      console.warn(`Input data length (${data.length}) is greater than required (${rowLength}). Truncating.`);
      data = data.slice(0, rowLength);
    }

    const x = [Array.from(data)];

    let speechProb;
    try {
      console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${this.samplingRate}`);
      const result = await this.model.call(x, this.samplingRate);
      if (result && Array.isArray(result) && result[0] && result[0][0] !== undefined) {
        speechProb = result[0][0];
        console.log(`Speech probability: ${speechProb}`);
      } else {
        throw new Error("Unexpected response from model");
      }
    } catch (e) {
      console.error("Error in VadDetector.apply:", e);
      throw new Error("Error calling the model: " + e);
    }

    // Any probability back above the start threshold cancels a pending end.
    if (speechProb >= this.startThreshold && this.tempEnd !== 0) {
      this.tempEnd = 0;
    }

    if (speechProb >= this.startThreshold && !this.triggered) {
      this.triggered = true;
      let speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
      console.log(`Speech start detected at sample ${speechStart}`);
      if (returnSeconds) {
        const speechStartSeconds = speechStart / this.samplingRate;
        return { start: Number(speechStartSeconds.toFixed(1)) };
      } else {
        return { start: speechStart };
      }
    }

    if (speechProb < this.endThreshold && this.triggered) {
      console.log(`Potential speech end at sample ${this.currentSample}`);
      if (this.tempEnd === 0) {
        this.tempEnd = this.currentSample;
      }

      // Only confirm the end after minSilenceSamples of continuous silence.
      if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
        console.log('Silence duration too short, continuing');
        return {};
      } else {
        const speechEnd = this.tempEnd + this.speechPadSamples;
        console.log(`Speech end confirmed at sample ${speechEnd}`);
        this.tempEnd = 0;
        this.triggered = false;

        if (returnSeconds) {
          const speechEndSeconds = speechEnd / this.samplingRate;
          return { end: Number(speechEndSeconds.toFixed(1)) };
        } else {
          return { end: speechEnd };
        }
      }
    }

    return {};
  }

  async close() {
    this.reset();
    await this.model.close();
  }
}
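To make the hysteresis concrete, these are the sample counts the constructor derives from the values SpeechChunks.ts passes in (16 kHz, 600 ms minimum silence, 500 ms padding):

  // Worked numbers for the configuration used by SpeechChunks.ts:
  const samplingRate = 16000;
  const minSilenceSamples = samplingRate * 600 / 1000; // 9600 samples ≈ 19 windows of 512
  const speechPadSamples = samplingRate * 500 / 1000;  // 8000 samples (~500 ms)
  // A segment ends only after the probability stays below endThreshold (0.45)
  // for ~600 ms; the reported start/end are then widened by ~500 ms of padding.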
globals.css
ADDED
@@ -0,0 +1,33 @@
@tailwind base;
@tailwind components;
@tailwind utilities;

:root {
  --foreground-rgb: 0, 0, 0;
  --background-start-rgb: 214, 219, 220;
  --background-end-rgb: 255, 255, 255;
}

@media (prefers-color-scheme: dark) {
  :root {
    --foreground-rgb: 255, 255, 255;
    --background-start-rgb: 0, 0, 0;
    --background-end-rgb: 0, 0, 0;
  }
}

body {
  color: rgb(var(--foreground-rgb));
  background: linear-gradient(
      to bottom,
      transparent,
      rgb(var(--background-end-rgb))
    )
    rgb(var(--background-start-rgb));
}

@layer utilities {
  .text-balance {
    text-wrap: balance;
  }
}
index.html
ADDED
@@ -0,0 +1,64 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Voice Activity Detection Demo</title>
  <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js"></script>
  <script src="https://cdn.tailwindcss.com"></script>
  <script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
  <script>
    ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/';
  </script>
</head>
<body>
  <main class="flex min-h-screen flex-col items-center justify-between p-24">
    <div class="text-center">
      <div id="status" class="text-4xl mb-4">🔇 Not Listening</div>
      <div id="audioList" class="space-y-4"></div>
    </div>
  </main>

  <!-- Note: the .ts files in this upload contain plain JavaScript; importing
       them directly only works if the server delivers them with a JavaScript
       MIME type. The Babel standalone script above is not actually used. -->
  <script type="module">
    import { SpeechChunks } from './SpeechChunks.ts';

    let speechChunks;

    function updateStatus(isListening) {
      document.getElementById('status').textContent = isListening ? "🎙️ Listening..." : "🔇 Not Listening";
    }

    function addAudioToList(blob) {
      const audioList = document.getElementById('audioList');
      const audio = document.createElement('audio');
      audio.controls = true;
      audio.src = URL.createObjectURL(blob);
      audio.onended = () => URL.revokeObjectURL(audio.src);
      audioList.appendChild(audio);
    }

    async function initializeSpeechChunks() {
      try {
        speechChunks = new SpeechChunks(
          () => {
            console.log("speech start");
            updateStatus(true);
          },
          (blob) => {
            console.log("speech end");
            updateStatus(false);
            addAudioToList(blob);
          }
        );
        await speechChunks.start();
      } catch (error) {
        console.error("Error initializing SpeechChunks:", error);
        updateStatus(false);
        document.getElementById('status').textContent = "Error: " + error.message;
      }
    }

    initializeSpeechChunks();
  </script>
</body>
</html>