Upload 6 files
- MicrophoneAudio.ts +119 -0
- Silero.ts +155 -0
- SpeechChunks.ts +126 -0
- VoiceActivityDetector.ts +109 -0
- globals.css +33 -0
- index.html +64 -0
MicrophoneAudio.ts
ADDED
@@ -0,0 +1,119 @@
// Captures microphone input with getUserMedia and slices it into fixed-size
// Float32Array windows via an inline AudioWorklet, delivering each window to
// the onAudioData callback.
class MicrophoneAudio {
  constructor(options) {
    console.log('Initializing MicrophoneAudio');
    this.options = {
      sampleRate: 16000,
      channels: 1,
      ...options,
    };
    this.stream = null;
    this.audioContext = null;
    this.sourceNode = null;
    this.workletNode = null;
    this.buffer = new Float32Array();
    console.log(`MicrophoneAudio options: ${JSON.stringify(this.options)}`);
  }

  getDeviceId() {
    console.log('Getting device ID');
    // Note: this opens a second MediaStream just to read the device ID and
    // never stops its tracks, so that extra capture stays live.
    return navigator.mediaDevices.getUserMedia({ audio: true }).then((stream) => {
      const deviceId = stream.getTracks()[0].getSettings().deviceId;
      console.log("The device Id is", deviceId);
      return deviceId;
    });
  }

  async start() {
    console.log('Starting MicrophoneAudio');
    try {
      this.stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          sampleRate: this.options.sampleRate,
          channelCount: this.options.channels,
        },
      });
      console.log('MediaStream acquired');

      this.getDeviceId().then((deviceId) => {
        console.log("The device Id is", deviceId);
      });
      this.audioContext = new AudioContext({
        sampleRate: this.options.sampleRate,
      });

      // The worklet source is inlined as a Blob URL; the window size is baked
      // in via template interpolation, so options.windowSizeSamples must be set.
      await this.audioContext.audioWorklet.addModule(
        URL.createObjectURL(new Blob([`
          class AudioProcessor extends AudioWorkletProcessor {
            constructor() {
              super();
              this.buffer = new Float32Array();
            }

            process(inputs, outputs, parameters) {
              const input = inputs[0];
              const channelData = input[0];

              this.buffer = Float32Array.from([...this.buffer, ...channelData]);

              while (this.buffer.length >= ${this.options.windowSizeSamples}) {
                const chunk = this.buffer.slice(0, ${this.options.windowSizeSamples});
                this.port.postMessage(chunk);
                this.buffer = this.buffer.slice(${this.options.windowSizeSamples});
              }

              return true;
            }
          }

          registerProcessor('audio-processor', AudioProcessor);
        `], { type: 'application/javascript' }))
      );

      this.sourceNode = this.audioContext.createMediaStreamSource(this.stream);
      this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');

      this.workletNode.port.onmessage = (event) => {
        this.options.onAudioData(event.data);
      };

      this.sourceNode.connect(this.workletNode);
      this.workletNode.connect(this.audioContext.destination);
      console.log('AudioWorklet added and connected');
    } catch (error) {
      console.error('Error starting microphone:', error);
      throw error;
    }
  }

  stop() {
    console.log('Stopping MicrophoneAudio');
    if (this.workletNode) {
      // The worklet defines no message handler, so this 'flush' is a no-op;
      // any partial window still buffered inside the worklet is dropped.
      this.workletNode.port.postMessage('flush');
      this.workletNode.disconnect();
      this.workletNode = null;
    }

    if (this.sourceNode) {
      this.sourceNode.disconnect();
      this.sourceNode = null;
    }

    if (this.audioContext) {
      this.audioContext.close();
      this.audioContext = null;
    }

    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop());
      this.stream = null;
    }

    // Note: this.buffer is never written to on the main thread (framing
    // happens inside the worklet), so this flush currently never fires.
    if (this.buffer.length > 0) {
      this.options.onAudioData(this.buffer);
      this.buffer = new Float32Array();
    }
    console.log('MicrophoneAudio stopped');
  }
}

export default MicrophoneAudio;
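For orientation, a minimal, hypothetical caller (not part of the upload): the class needs a sample rate, a window size, and a callback, and windowSizeSamples has no default, so it must be supplied (SpeechChunks.ts below passes 512).

  import MicrophoneAudio from './MicrophoneAudio.ts';

  // Illustrative only; assumes a browser ES module context (top-level await)
  // and that the user grants microphone permission.
  const mic = new MicrophoneAudio({
    sampleRate: 16000,
    windowSizeSamples: 512,     // no default exists; SpeechChunks.ts uses 512
    onAudioData: (frame) => {   // one Float32Array of 512 samples per call
      console.log('got frame of', frame.length, 'samples');
    },
  });
  await mic.start();
  // ... later:
  mic.stop();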
Silero.ts
ADDED
@@ -0,0 +1,155 @@
// Thin wrapper around the Silero VAD ONNX model. Expects the onnxruntime-web
// global `ort` to be loaded beforehand (index.html pulls it from a CDN).
class OnnxWrapper {
  constructor(path, force_onnx_cpu = true) {
    console.log(`Initializing OnnxWrapper with path: ${path}`);
    this.sessionReady = this.initSession(path, force_onnx_cpu);
    this.resetStates();
    this.sample_rates = [8000, 16000];
  }

  async ready() {
    console.log('Waiting for OnnxWrapper session to be ready');
    await this.sessionReady;
    console.log('OnnxWrapper session is ready');
  }

  async initSession(path, force_onnx_cpu) {
    console.log(`Initializing ONNX session with force_onnx_cpu: ${force_onnx_cpu}`);
    const options = {
      executionProviders: force_onnx_cpu ? ['wasm'] : ['webgl', 'wasm'],
      graphOptimizationLevel: 'all',
      executionMode: 'sequential',
      enableCpuMemArena: true,
      enableMemPattern: true,
      extra: {
        session: {
          intra_op_num_threads: 1,
          inter_op_num_threads: 1,
        }
      }
    };

    this.session = await ort.InferenceSession.create(path, options);
    console.log('ONNX session created successfully');
  }

  _validate_input(x, sr) {
    if (!Array.isArray(x[0])) {
      x = [x];
    }
    if (x.length > 2) {
      throw new Error(`Too many dimensions for input audio chunk ${x.length}`);
    }
    // Downsample multiples of 16 kHz by simple decimation.
    if (sr !== 16000 && (sr % 16000 === 0)) {
      const step = Math.floor(sr / 16000);
      x = x.map(row => row.filter((_, i) => i % step === 0));
      sr = 16000;
    }
    if (!this.sample_rates.includes(sr)) {
      throw new Error(`Supported sampling rates: ${this.sample_rates} (or a multiple of 16000)`);
    }
    if (sr / x[0].length > 31.25) {
      throw new Error("Input audio chunk is too short");
    }
    return [x, sr];
  }

  resetStates(batch_size = 1) {
    console.log(`Resetting states with batch_size: ${batch_size}`);
    this._state = Array(2).fill(0).map(() => Array(batch_size * 128).fill(0));
    this._context = [];
    this._last_sr = 0;
    this._last_batch_size = 0;
  }

  async call(x, sr) {
    console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
    await this.ready();
    [x, sr] = this._validate_input(x, sr);
    const num_samples = sr === 16000 ? 512 : 256;

    if (x[0].length !== num_samples) {
      throw new Error(`Provided number of samples is ${x[0].length} (Supported values: 256 for 8000 sample rate, 512 for 16000)`);
    }

    const batch_size = x.length;
    const context_size = sr === 16000 ? 64 : 32;

    // Reset state whenever the batch size or sample rate changes between calls.
    if (!this._last_batch_size) {
      this.resetStates(batch_size);
    }
    if (this._last_sr && this._last_sr !== sr) {
      this.resetStates(batch_size);
    }
    if (this._last_batch_size && this._last_batch_size !== batch_size) {
      this.resetStates(batch_size);
    }
    if (this._context.length === 0) {
      this._context = Array(batch_size * context_size).fill(0);
    }

    // Prepend the trailing context from the previous window to each row.
    x = x.map((row, i) => [...this._context.slice(i * context_size, (i + 1) * context_size), ...row]);

    if (sr === 8000 || sr === 16000) {
      const inputTensor = new ort.Tensor('float32', x.flat(), [batch_size, x[0].length]);
      const stateTensor = new ort.Tensor('float32', this._state.flat(), [2, batch_size, 128]);
      const srTensor = new ort.Tensor('int64', [sr], []);

      const feeds = {
        input: inputTensor,
        state: stateTensor,
        sr: srTensor
      };

      const results = await this.session.run(feeds);
      const outputData = results.output.data;
      const stateData = results.stateN.data;

      this._state = Array(2).fill(0).map((_, i) =>
        Array.from(stateData.slice(i * batch_size * 128, (i + 1) * batch_size * 128))
      );

      const outputShape = results.output.dims;
      const out = Array(outputShape[0]).fill(0).map((_, i) =>
        Array.from(outputData.slice(i * outputShape[1], (i + 1) * outputShape[1]))
      );

      // Keep the last context_size samples of each row for the next call.
      this._context = x.map(row => row.slice(-context_size)).flat();
      this._last_sr = sr;
      this._last_batch_size = batch_size;

      console.log(`Model call completed, output shape: [${out.length}, ${out[0].length}]`);
      return out;
    } else {
      throw new Error(`Unsupported sample rate: ${sr}. Supported rates are 8000 and 16000.`);
    }
  }

  async audio_forward(x, sr) {
    console.log(`Running audio_forward with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
    const outs = [];
    [x, sr] = this._validate_input(x, sr);
    this.resetStates();
    const num_samples = sr === 16000 ? 512 : 256;

    // Zero-pad so the audio divides evenly into model-sized windows.
    if (x[0].length % num_samples !== 0) {
      const pad_num = num_samples - (x[0].length % num_samples);
      x = x.map(row => [...row, ...Array(pad_num).fill(0)]);
    }

    for (let i = 0; i < x[0].length; i += num_samples) {
      const wavs_batch = x.map(row => row.slice(i, i + num_samples));
      const out_chunk = await this.call(wavs_batch, sr);
      outs.push(out_chunk);
    }

    console.log(`audio_forward completed, output shape: [${outs.length}, ${outs[0].length}]`);
    return outs.reduce((acc, curr) => acc.map((row, i) => [...row, ...curr[i]]));
  }

  close() {
    console.log('Closing OnnxWrapper session');
    this.session.release();
  }
}

export default OnnxWrapper;
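A sketch of driving the wrapper directly, assuming silero_vad.onnx is served alongside the page and the ort global is already loaded (as index.html below arranges): the model accepts only 512-sample frames at 16 kHz (256 at 8 kHz) and returns one speech probability per batch row.

  import OnnxWrapper from './Silero.ts';

  // Illustrative only; a 512-sample frame of silence should score near 0.
  const vad = new OnnxWrapper('silero_vad.onnx');
  const frame = new Array(512).fill(0);        // one 32 ms frame at 16 kHz
  const out = await vad.call([frame], 16000);  // resolves to shape [1, 1]
  console.log('speech probability:', out[0][0]);
  vad.close();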
SpeechChunks.ts
ADDED
@@ -0,0 +1,126 @@
import MicrophoneAudio from './MicrophoneAudio.ts';
import { VadDetector } from './VoiceActivityDetector.ts';

// Glues microphone capture to the VAD: buffers audio while speech is active
// and hands a WAV blob to onSpeechEnd when the detector reports an end.
export class SpeechChunks {
  static SAMPLE_RATE = 16000;
  static START_THRESHOLD = 0.6;
  static END_THRESHOLD = 0.45;
  static MIN_SILENCE_DURATION_MS = 600;
  static SPEECH_PAD_MS = 500;
  static WINDOW_SIZE_SAMPLES = 512;

  constructor(onSpeechStart, onSpeechEnd) {
    this.chunks = [];
    this.isSpeechActive = false;
    this.onSpeechStart = onSpeechStart;
    this.onSpeechEnd = onSpeechEnd;
    console.log('SpeechChunks initialized');
  }

  async initialize() {
    this.microphoneAudio = new MicrophoneAudio({
      sampleRate: SpeechChunks.SAMPLE_RATE,
      windowSizeSamples: SpeechChunks.WINDOW_SIZE_SAMPLES,
      onAudioData: this.processAudioData.bind(this)
    });

    this.vadDetector = new VadDetector(
      SpeechChunks.START_THRESHOLD,
      SpeechChunks.END_THRESHOLD,
      SpeechChunks.SAMPLE_RATE,
      SpeechChunks.MIN_SILENCE_DURATION_MS,
      SpeechChunks.SPEECH_PAD_MS
    );
  }

  async processAudioData(audioData) {
    console.log(`Processing audio data of length ${audioData.length}`);
    try {
      const result = await this.vadDetector.apply(audioData, false);
      if (result.start !== undefined) {
        this.isSpeechActive = true;
        console.log('Speech start detected');
        this.onSpeechStart();
      } else if (result.end !== undefined) {
        this.isSpeechActive = false;
        console.log('Speech end detected');
        this.onSpeechEnd(this.getBlob());
        // Reset the buffer so the next utterance does not replay this one.
        this.chunks = [];
      }
      if (this.isSpeechActive) {
        console.log('Adding chunk to speech');
        this.chunks.push(Array.from(audioData));
      }
    } catch (error) {
      console.error('Error processing audio data', error);
    }
  }

  async start() {
    console.log('Starting SpeechChunks');
    await this.initialize();
    await this.microphoneAudio.start();
  }

  stop() {
    console.log('Stopping SpeechChunks');
    this.microphoneAudio.stop();
    this.vadDetector.reset();
    this.isSpeechActive = false;
  }

  getSpeechChunks() {
    console.log(`Returning ${this.chunks.length} speech chunks`);
    const speechChunks = this.chunks;
    this.chunks = [];
    return speechChunks;
  }

  getBlob() {
    console.log('Creating audio blob from speech chunks');
    const combinedChunks = this.chunks.flat();
    const combinedAudio = new Float32Array(combinedChunks);

    // Convert float samples in [-1, 1] to 16-bit PCM.
    const intData = new Int16Array(combinedAudio.length);
    for (let i = 0; i < combinedAudio.length; i++) {
      const s = Math.max(-1, Math.min(1, combinedAudio[i]));
      intData[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
    }

    // Write a canonical 44-byte WAV header (PCM, mono, 16-bit).
    const buffer = new ArrayBuffer(44 + intData.length * 2);
    const view = new DataView(buffer);

    this.writeString(view, 0, 'RIFF');
    view.setUint32(4, 36 + intData.length * 2, true);
    this.writeString(view, 8, 'WAVE');
    this.writeString(view, 12, 'fmt ');
    view.setUint32(16, 16, true);
    view.setUint16(20, 1, true);
    view.setUint16(22, 1, true);
    view.setUint32(24, SpeechChunks.SAMPLE_RATE, true);
    view.setUint32(28, SpeechChunks.SAMPLE_RATE * 2, true);
    view.setUint16(32, 2, true);
    view.setUint16(34, 16, true);
    this.writeString(view, 36, 'data');
    view.setUint32(40, intData.length * 2, true);

    for (let i = 0; i < intData.length; i++) {
      view.setInt16(44 + i * 2, intData[i], true);
    }

    const blob = new Blob([buffer], { type: 'audio/wav' });
    console.log(`Created blob of size ${blob.size} bytes`);
    return blob;
  }

  writeString(view, offset, string) {
    for (let i = 0; i < string.length; i++) {
      view.setUint8(offset + i, string.charCodeAt(i));
    }
  }

  async close() {
    console.log('Closing SpeechChunks');
    this.stop();
    await this.vadDetector.close();
  }
}
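As a sanity check on what getBlob() emits — a canonical 44-byte PCM WAV header followed by mono 16-bit samples — here is a small illustrative parser (not part of the upload) that reads the fields back:

  // Hypothetical helper; field offsets match the writes in getBlob() above.
  async function inspectWavHeader(blob) {
    const view = new DataView(await blob.arrayBuffer());
    const tag = (o) => String.fromCharCode(...[0, 1, 2, 3].map(i => view.getUint8(o + i)));
    console.log({
      riff: tag(0),                            // "RIFF"
      wave: tag(8),                            // "WAVE"
      audioFormat: view.getUint16(20, true),   // 1 = PCM
      channels: view.getUint16(22, true),      // 1 (mono)
      sampleRate: view.getUint32(24, true),    // 16000
      byteRate: view.getUint32(28, true),      // 32000 = 16000 * 1 channel * 2 bytes
      bitsPerSample: view.getUint16(34, true), // 16
      dataBytes: view.getUint32(40, true),     // number of samples * 2
    });
  }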
VoiceActivityDetector.ts
ADDED
@@ -0,0 +1,109 @@
import OnnxWrapper from './Silero.ts';

const modelPath = "silero_vad.onnx"; // Make sure this path is correct

// Stateful start/end detector on top of the Silero model: speech starts when
// the probability crosses startThreshold and ends once it has stayed below
// endThreshold for at least minSilenceDurationMs.
export class VadDetector {
  constructor(startThreshold, endThreshold, samplingRate, minSilenceDurationMs, speechPadMs) {
    if (samplingRate !== 8000 && samplingRate !== 16000) {
      throw new Error("Does not support sampling rates other than [8000, 16000]");
    }

    this.model = new OnnxWrapper(modelPath);
    this.startThreshold = startThreshold;
    this.endThreshold = endThreshold;
    this.samplingRate = samplingRate;
    this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
    this.speechPadSamples = samplingRate * speechPadMs / 1000;
    this.reset();
    console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
  }

  reset() {
    this.model.resetStates();
    this.triggered = false;
    this.tempEnd = 0;
    this.currentSample = 0;
    console.log('VadDetector reset');
  }

  async apply(data, returnSeconds) {
    console.log(`Applying VAD to data of length ${data.length}`);
    const windowSizeSamples = data.length;
    this.currentSample += windowSizeSamples;

    const rowLength = this.samplingRate === 16000 ? 512 : 256;

    // Ensure data is the correct length
    if (data.length < rowLength) {
      console.warn(`Input data length (${data.length}) is less than required (${rowLength}). Padding with zeros.`);
      data = [...data, ...new Array(rowLength - data.length).fill(0)];
    } else if (data.length > rowLength) {
      console.warn(`Input data length (${data.length}) is greater than required (${rowLength}). Truncating.`);
      data = data.slice(0, rowLength);
    }

    const x = [Array.from(data)];

    let speechProb;
    try {
      console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${this.samplingRate}`);
      const result = await this.model.call(x, this.samplingRate);
      if (result && Array.isArray(result) && result[0] && result[0][0] !== undefined) {
        speechProb = result[0][0];
        console.log(`Speech probability: ${speechProb}`);
      } else {
        throw new Error("Unexpected response from model");
      }
    } catch (e) {
      console.error("Error in VadDetector.apply:", e);
      throw new Error("Error calling the model: " + e);
    }

    // Any probability back above the start threshold cancels a pending end.
    if (speechProb >= this.startThreshold && this.tempEnd !== 0) {
      this.tempEnd = 0;
    }

    if (speechProb >= this.startThreshold && !this.triggered) {
      this.triggered = true;
      let speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
      console.log(`Speech start detected at sample ${speechStart}`);
      if (returnSeconds) {
        const speechStartSeconds = speechStart / this.samplingRate;
        return { start: Number(speechStartSeconds.toFixed(1)) };
      } else {
        return { start: speechStart };
      }
    }

    if (speechProb < this.endThreshold && this.triggered) {
      console.log(`Potential speech end at sample ${this.currentSample}`);
      if (this.tempEnd === 0) {
        this.tempEnd = this.currentSample;
      }

      // Only confirm the end after minSilenceSamples of continuous silence.
      if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
        console.log('Silence duration too short, continuing');
        return {};
      } else {
        const speechEnd = this.tempEnd + this.speechPadSamples;
        console.log(`Speech end confirmed at sample ${speechEnd}`);
        this.tempEnd = 0;
        this.triggered = false;

        if (returnSeconds) {
          const speechEndSeconds = speechEnd / this.samplingRate;
          return { end: Number(speechEndSeconds.toFixed(1)) };
        } else {
          return { end: speechEnd };
        }
      }
    }

    return {};
  }

  async close() {
    this.reset();
    await this.model.close();
  }
}
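To make the hysteresis concrete, these are the sample counts the constructor derives from the values SpeechChunks.ts passes in (16 kHz, 600 ms minimum silence, 500 ms padding):

  // Worked numbers for the configuration used by SpeechChunks.ts:
  const samplingRate = 16000;
  const minSilenceSamples = samplingRate * 600 / 1000; // 9600 samples ≈ 19 windows of 512
  const speechPadSamples = samplingRate * 500 / 1000;  // 8000 samples (~500 ms)
  // A segment ends only after the probability stays below endThreshold (0.45)
  // for ~600 ms; the reported start/end are then widened by ~500 ms of padding.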
globals.css
ADDED
@@ -0,0 +1,33 @@
@tailwind base;
@tailwind components;
@tailwind utilities;

:root {
  --foreground-rgb: 0, 0, 0;
  --background-start-rgb: 214, 219, 220;
  --background-end-rgb: 255, 255, 255;
}

@media (prefers-color-scheme: dark) {
  :root {
    --foreground-rgb: 255, 255, 255;
    --background-start-rgb: 0, 0, 0;
    --background-end-rgb: 0, 0, 0;
  }
}

body {
  color: rgb(var(--foreground-rgb));
  background: linear-gradient(
      to bottom,
      transparent,
      rgb(var(--background-end-rgb))
    )
    rgb(var(--background-start-rgb));
}

@layer utilities {
  .text-balance {
    text-wrap: balance;
  }
}
index.html
ADDED
@@ -0,0 +1,64 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Voice Activity Detection Demo</title>
  <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js"></script>
  <script src="https://cdn.tailwindcss.com"></script>
  <script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
  <script>
    ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/';
  </script>
</head>
<body>
  <main class="flex min-h-screen flex-col items-center justify-between p-24">
    <div class="text-center">
      <div id="status" class="text-4xl mb-4">🔇 Not Listening</div>
      <div id="audioList" class="space-y-4"></div>
    </div>
  </main>

  <!-- Note: the .ts files in this upload contain plain JavaScript; importing
       them directly only works if the server delivers them with a JavaScript
       MIME type. The Babel standalone script above is not actually used. -->
  <script type="module">
    import { SpeechChunks } from './SpeechChunks.ts';

    let speechChunks;

    function updateStatus(isListening) {
      document.getElementById('status').textContent = isListening ? "🎙️ Listening..." : "🔇 Not Listening";
    }

    function addAudioToList(blob) {
      const audioList = document.getElementById('audioList');
      const audio = document.createElement('audio');
      audio.controls = true;
      audio.src = URL.createObjectURL(blob);
      audio.onended = () => URL.revokeObjectURL(audio.src);
      audioList.appendChild(audio);
    }

    async function initializeSpeechChunks() {
      try {
        speechChunks = new SpeechChunks(
          () => {
            console.log("speech start");
            updateStatus(true);
          },
          (blob) => {
            console.log("speech end");
            updateStatus(false);
            addAudioToList(blob);
          }
        );
        await speechChunks.start();
      } catch (error) {
        console.error("Error initializing SpeechChunks:", error);
        updateStatus(false);
        document.getElementById('status').textContent = "Error: " + error.message;
      }
    }

    initializeSpeechChunks();
  </script>
</body>
</html>