atlury committed
Commit c48a715 · verified · 1 parent: 8e5cfea

Upload 6 files

Files changed (6)
  1. MicrophoneAudio.ts +119 -0
  2. Silero.ts +155 -0
  3. SpeechChunks.ts +126 -0
  4. VoiceActivityDetector.ts +109 -0
  5. globals.css +33 -0
  6. index.html +64 -0
MicrophoneAudio.ts ADDED
@@ -0,0 +1,119 @@
+ // Captures microphone input through an AudioWorklet and delivers
+ // fixed-size Float32Array chunks via options.onAudioData.
+ class MicrophoneAudio {
+   constructor(options) {
+     console.log('Initializing MicrophoneAudio');
+     this.options = {
+       sampleRate: 16000,
+       channels: 1,
+       // Callers must also supply windowSizeSamples and onAudioData.
+       ...options,
+     };
+     this.stream = null;
+     this.audioContext = null;
+     this.sourceNode = null;
+     this.workletNode = null;
+     this.buffer = new Float32Array();
+     console.log(`MicrophoneAudio options: ${JSON.stringify(this.options)}`);
+   }
+
+   getDeviceId() {
+     console.log('Getting device ID');
+     return navigator.mediaDevices.getUserMedia({ audio: true }).then((stream) => {
+       const deviceId = stream.getTracks()[0].getSettings().deviceId;
+       // Stop this probe stream so it does not keep the microphone open.
+       stream.getTracks().forEach((track) => track.stop());
+       console.log('The device Id is', deviceId);
+       return deviceId;
+     });
+   }
+
+   async start() {
+     console.log('Starting MicrophoneAudio');
+     try {
+       this.stream = await navigator.mediaDevices.getUserMedia({
+         audio: {
+           sampleRate: this.options.sampleRate,
+           channelCount: this.options.channels,
+         },
+       });
+       console.log('MediaStream acquired');
+
+       this.getDeviceId().then((deviceId) => {
+         console.log('The device Id is', deviceId);
+       });
+       this.audioContext = new AudioContext({
+         sampleRate: this.options.sampleRate,
+       });
+
+       // The worklet buffers incoming samples and reposts them in
+       // windows of exactly windowSizeSamples.
+       await this.audioContext.audioWorklet.addModule(
+         URL.createObjectURL(new Blob([`
+           class AudioProcessor extends AudioWorkletProcessor {
+             constructor() {
+               super();
+               this.buffer = new Float32Array();
+               // Emit any partially filled window when the main thread asks.
+               this.port.onmessage = (event) => {
+                 if (event.data === 'flush' && this.buffer.length > 0) {
+                   this.port.postMessage(this.buffer);
+                   this.buffer = new Float32Array();
+                 }
+               };
+             }
+
+             process(inputs, outputs, parameters) {
+               const input = inputs[0];
+               const channelData = input[0];
+               if (!channelData) {
+                 return true; // no input this render quantum
+               }
+
+               this.buffer = Float32Array.from([...this.buffer, ...channelData]);
+
+               while (this.buffer.length >= ${this.options.windowSizeSamples}) {
+                 const chunk = this.buffer.slice(0, ${this.options.windowSizeSamples});
+                 this.port.postMessage(chunk);
+                 this.buffer = this.buffer.slice(${this.options.windowSizeSamples});
+               }
+
+               return true;
+             }
+           }
+
+           registerProcessor('audio-processor', AudioProcessor);
+         `], { type: 'application/javascript' }))
+       );
+
+       this.sourceNode = this.audioContext.createMediaStreamSource(this.stream);
+       this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');
+
+       this.workletNode.port.onmessage = (event) => {
+         this.options.onAudioData(event.data);
+       };
+
+       this.sourceNode.connect(this.workletNode);
+       this.workletNode.connect(this.audioContext.destination);
+       console.log('AudioWorklet added and connected');
+     } catch (error) {
+       console.error('Error starting microphone:', error);
+       throw error;
+     }
+   }
+
+   stop() {
+     console.log('Stopping MicrophoneAudio');
+     if (this.workletNode) {
+       // Ask the worklet to emit any partial window before teardown.
+       this.workletNode.port.postMessage('flush');
+       this.workletNode.disconnect();
+       this.workletNode = null;
+     }
+
+     if (this.sourceNode) {
+       this.sourceNode.disconnect();
+       this.sourceNode = null;
+     }
+
+     if (this.audioContext) {
+       this.audioContext.close();
+       this.audioContext = null;
+     }
+
+     if (this.stream) {
+       this.stream.getTracks().forEach((track) => track.stop());
+       this.stream = null;
+     }
+
+     // Flush anything buffered on the main thread (normally empty,
+     // since windowing happens inside the worklet).
+     if (this.buffer.length > 0) {
+       this.options.onAudioData(this.buffer);
+       this.buffer = new Float32Array();
+     }
+     console.log('MicrophoneAudio stopped');
+   }
+ }
+
+ export default MicrophoneAudio;
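
A minimal usage sketch for this class on its own (not part of the commit; the 512-sample window matches what SpeechChunks passes in below):

    const mic = new MicrophoneAudio({
      sampleRate: 16000,
      windowSizeSamples: 512, // one Silero VAD window at 16 kHz
      onAudioData: (chunk) => {
        // Each chunk is exactly 512 mono float samples.
        console.log('got chunk of length', chunk.length);
      },
    });
    await mic.start(); // prompts for microphone permission
    // ... later:
    mic.stop();        // flushes any partial window and releases the mic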
Silero.ts ADDED
@@ -0,0 +1,155 @@
+ // Thin wrapper around the Silero VAD ONNX model, run with onnxruntime-web.
+ // Relies on the global `ort` object loaded from the onnxruntime-web script tag.
+ class OnnxWrapper {
+   constructor(path, force_onnx_cpu = true) {
+     console.log(`Initializing OnnxWrapper with path: ${path}`);
+     this.sessionReady = this.initSession(path, force_onnx_cpu);
+     this.resetStates();
+     this.sample_rates = [8000, 16000];
+   }
+
+   async ready() {
+     console.log('Waiting for OnnxWrapper session to be ready');
+     await this.sessionReady;
+     console.log('OnnxWrapper session is ready');
+   }
+
+   async initSession(path, force_onnx_cpu) {
+     console.log(`Initializing ONNX session with force_onnx_cpu: ${force_onnx_cpu}`);
+     const options = {
+       executionProviders: force_onnx_cpu ? ['wasm'] : ['webgl', 'wasm'],
+       graphOptimizationLevel: 'all',
+       executionMode: 'sequential',
+       enableCpuMemArena: true,
+       enableMemPattern: true,
+       extra: {
+         session: {
+           intra_op_num_threads: 1,
+           inter_op_num_threads: 1,
+         }
+       }
+     };
+
+     this.session = await ort.InferenceSession.create(path, options);
+     console.log('ONNX session created successfully');
+   }
+
+   _validate_input(x, sr) {
+     if (!Array.isArray(x[0])) {
+       x = [x];
+     }
+     if (x.length > 2) {
+       throw new Error(`Too many dimensions for input audio chunk ${x.length}`);
+     }
+     // Downsample multiples of 16000 by simple decimation.
+     if (sr !== 16000 && (sr % 16000 === 0)) {
+       const step = Math.floor(sr / 16000);
+       x = x.map(row => row.filter((_, i) => i % step === 0));
+       sr = 16000;
+     }
+     if (!this.sample_rates.includes(sr)) {
+       throw new Error(`Supported sampling rates: ${this.sample_rates} (or a multiple of 16000)`);
+     }
+     if (sr / x[0].length > 31.25) {
+       throw new Error('Input audio chunk is too short');
+     }
+     return [x, sr];
+   }
+
+   resetStates(batch_size = 1) {
+     console.log(`Resetting states with batch_size: ${batch_size}`);
+     this._state = Array(2).fill(0).map(() => Array(batch_size * 128).fill(0));
+     this._context = [];
+     this._last_sr = 0;
+     this._last_batch_size = 0;
+   }
+
+   async call(x, sr) {
+     await this.ready();
+     [x, sr] = this._validate_input(x, sr);
+     console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
+     const num_samples = sr === 16000 ? 512 : 256;
+
+     if (x[0].length !== num_samples) {
+       throw new Error(`Provided number of samples is ${x[0].length} (Supported values: 256 for 8000 sample rate, 512 for 16000)`);
+     }
+
+     const batch_size = x.length;
+     const context_size = sr === 16000 ? 64 : 32;
+
+     if (!this._last_batch_size) {
+       this.resetStates(batch_size);
+     }
+     if (this._last_sr && this._last_sr !== sr) {
+       this.resetStates(batch_size);
+     }
+     if (this._last_batch_size && this._last_batch_size !== batch_size) {
+       this.resetStates(batch_size);
+     }
+     if (this._context.length === 0) {
+       this._context = Array(batch_size * context_size).fill(0);
+     }
+
+     // Prepend the trailing context of the previous window to each row.
+     x = x.map((row, i) => [...this._context.slice(i * context_size, (i + 1) * context_size), ...row]);
+
+     if (sr === 8000 || sr === 16000) {
+       const inputTensor = new ort.Tensor('float32', x.flat(), [batch_size, x[0].length]);
+       const stateTensor = new ort.Tensor('float32', this._state.flat(), [2, batch_size, 128]);
+       // int64 tensors require BigInt64Array data in onnxruntime-web.
+       const srTensor = new ort.Tensor('int64', BigInt64Array.from([BigInt(sr)]), []);
+
+       const feeds = {
+         input: inputTensor,
+         state: stateTensor,
+         sr: srTensor
+       };
+
+       const results = await this.session.run(feeds);
+       const outputData = results.output.data;
+       const stateData = results.stateN.data;
+
+       this._state = Array(2).fill(0).map((_, i) =>
+         Array.from(stateData.slice(i * batch_size * 128, (i + 1) * batch_size * 128))
+       );
+
+       const outputShape = results.output.dims;
+       const out = Array(outputShape[0]).fill(0).map((_, i) =>
+         Array.from(outputData.slice(i * outputShape[1], (i + 1) * outputShape[1]))
+       );
+
+       this._context = x.map(row => row.slice(-context_size)).flat();
+       this._last_sr = sr;
+       this._last_batch_size = batch_size;
+
+       console.log(`Model call completed, output shape: [${out.length}, ${out[0].length}]`);
+       return out;
+     } else {
+       throw new Error(`Unsupported sample rate: ${sr}. Supported rates are 8000 and 16000.`);
+     }
+   }
+
+   async audio_forward(x, sr) {
+     console.log(`Running audio_forward with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
+     const outs = [];
+     [x, sr] = this._validate_input(x, sr);
+     this.resetStates();
+     const num_samples = sr === 16000 ? 512 : 256;
+
+     // Zero-pad so the signal divides evenly into model-sized windows.
+     if (x[0].length % num_samples !== 0) {
+       const pad_num = num_samples - (x[0].length % num_samples);
+       x = x.map(row => [...row, ...Array(pad_num).fill(0)]);
+     }
+
+     for (let i = 0; i < x[0].length; i += num_samples) {
+       const wavs_batch = x.map(row => row.slice(i, i + num_samples));
+       const out_chunk = await this.call(wavs_batch, sr);
+       outs.push(out_chunk);
+     }
+
+     console.log(`audio_forward completed over ${outs.length} windows`);
+     // Concatenate per-window outputs along the time axis.
+     return outs.reduce((acc, curr) => acc.map((row, i) => [...row, ...curr[i]]));
+   }
+
+   async close() {
+     console.log('Closing OnnxWrapper session');
+     await this.ready();
+     await this.session.release();
+   }
+ }
+
+ export default OnnxWrapper;
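
A sketch of driving OnnxWrapper directly, assuming silero_vad.onnx is served alongside the page; at 16 kHz the model consumes 512-sample windows and returns one speech probability per batch row (call() awaits session readiness internally):

    const vad = new OnnxWrapper('silero_vad.onnx');
    const frame = new Array(512).fill(0);         // one 32 ms window of silence
    const probs = await vad.call([frame], 16000); // shape [1, 1]
    console.log('speech probability:', probs[0][0]);
    await vad.close();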
SpeechChunks.ts ADDED
@@ -0,0 +1,126 @@
+ import MicrophoneAudio from './MicrophoneAudio.ts';
+ import { VadDetector } from './VoiceActivityDetector.ts';
+
+ // Glues microphone capture to the VAD: buffers audio while speech is
+ // active and emits each finished utterance as a WAV blob.
+ export class SpeechChunks {
+   static SAMPLE_RATE = 16000;
+   static START_THRESHOLD = 0.6;
+   static END_THRESHOLD = 0.45;
+   static MIN_SILENCE_DURATION_MS = 600;
+   static SPEECH_PAD_MS = 500;
+   static WINDOW_SIZE_SAMPLES = 512;
+
+   constructor(onSpeechStart, onSpeechEnd) {
+     this.chunks = [];
+     this.isSpeechActive = false;
+     this.onSpeechStart = onSpeechStart;
+     this.onSpeechEnd = onSpeechEnd;
+     console.log('SpeechChunks initialized');
+   }
+
+   async initialize() {
+     this.microphoneAudio = new MicrophoneAudio({
+       sampleRate: SpeechChunks.SAMPLE_RATE,
+       windowSizeSamples: SpeechChunks.WINDOW_SIZE_SAMPLES,
+       onAudioData: this.processAudioData.bind(this)
+     });
+
+     this.vadDetector = new VadDetector(
+       SpeechChunks.START_THRESHOLD,
+       SpeechChunks.END_THRESHOLD,
+       SpeechChunks.SAMPLE_RATE,
+       SpeechChunks.MIN_SILENCE_DURATION_MS,
+       SpeechChunks.SPEECH_PAD_MS
+     );
+   }
+
+   async processAudioData(audioData) {
+     console.log(`Processing audio data of length ${audioData.length}`);
+     try {
+       const result = await this.vadDetector.apply(audioData, false);
+       if (result.start !== undefined) {
+         this.isSpeechActive = true;
+         console.log('Speech start detected');
+         this.onSpeechStart();
+       } else if (result.end !== undefined) {
+         this.isSpeechActive = false;
+         console.log('Speech end detected');
+         this.onSpeechEnd(this.getBlob());
+         // Start the next utterance from a clean buffer.
+         this.chunks = [];
+       }
+       if (this.isSpeechActive) {
+         console.log('Adding chunk to speech');
+         this.chunks.push(Array.from(audioData));
+       }
+     } catch (error) {
+       console.error('Error processing audio data', error);
+     }
+   }
+
+   async start() {
+     console.log('Starting SpeechChunks');
+     await this.initialize();
+     await this.microphoneAudio.start();
+   }
+
+   stop() {
+     console.log('Stopping SpeechChunks');
+     this.microphoneAudio.stop();
+     this.vadDetector.reset();
+     this.isSpeechActive = false;
+   }
+
+   getSpeechChunks() {
+     console.log(`Returning ${this.chunks.length} speech chunks`);
+     const speechChunks = this.chunks;
+     this.chunks = [];
+     return speechChunks;
+   }
+
+   getBlob() {
+     console.log('Creating audio blob from speech chunks');
+     const combinedChunks = this.chunks.flat();
+     const combinedAudio = new Float32Array(combinedChunks);
+
+     // Convert float samples in [-1, 1] to 16-bit PCM.
+     const intData = new Int16Array(combinedAudio.length);
+     for (let i = 0; i < combinedAudio.length; i++) {
+       const s = Math.max(-1, Math.min(1, combinedAudio[i]));
+       intData[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
+     }
+
+     // Build a 44-byte canonical WAV header followed by the PCM data.
+     const buffer = new ArrayBuffer(44 + intData.length * 2);
+     const view = new DataView(buffer);
+
+     this.writeString(view, 0, 'RIFF');
+     view.setUint32(4, 36 + intData.length * 2, true);       // RIFF chunk size
+     this.writeString(view, 8, 'WAVE');
+     this.writeString(view, 12, 'fmt ');
+     view.setUint32(16, 16, true);                           // fmt chunk size
+     view.setUint16(20, 1, true);                            // audio format: PCM
+     view.setUint16(22, 1, true);                            // channels: mono
+     view.setUint32(24, SpeechChunks.SAMPLE_RATE, true);     // sample rate
+     view.setUint32(28, SpeechChunks.SAMPLE_RATE * 2, true); // byte rate
+     view.setUint16(32, 2, true);                            // block align
+     view.setUint16(34, 16, true);                           // bits per sample
+     this.writeString(view, 36, 'data');
+     view.setUint32(40, intData.length * 2, true);           // data chunk size
+
+     for (let i = 0; i < intData.length; i++) {
+       view.setInt16(44 + i * 2, intData[i], true);
+     }
+
+     const blob = new Blob([buffer], { type: 'audio/wav' });
+     console.log(`Created blob of size ${blob.size} bytes`);
+     return blob;
+   }
+
+   writeString(view, offset, string) {
+     for (let i = 0; i < string.length; i++) {
+       view.setUint8(offset + i, string.charCodeAt(i));
+     }
+   }
+
+   async close() {
+     console.log('Closing SpeechChunks');
+     this.stop();
+     await this.vadDetector.close();
+   }
+ }
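
A usage sketch matching how index.html below wires this class up (the callbacks here are placeholders):

    const chunks = new SpeechChunks(
      () => console.log('speech started'),
      (wav) => {
        // wav is a complete 16 kHz mono 16-bit PCM WAV blob.
        new Audio(URL.createObjectURL(wav)).play();
      }
    );
    await chunks.start();
    // ... when finished:
    await chunks.close();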
VoiceActivityDetector.ts ADDED
@@ -0,0 +1,109 @@
+ import OnnxWrapper from './Silero.ts';
+
+ const modelPath = "silero_vad.onnx"; // Make sure this path is correct
+
+ // Stateful hysteresis over per-window Silero speech probabilities:
+ // speech starts above startThreshold and ends once the probability
+ // stays below endThreshold for at least minSilenceDurationMs.
+ export class VadDetector {
+   constructor(startThreshold, endThreshold, samplingRate, minSilenceDurationMs, speechPadMs) {
+     if (samplingRate !== 8000 && samplingRate !== 16000) {
+       throw new Error("Does not support sampling rates other than [8000, 16000]");
+     }
+
+     this.model = new OnnxWrapper(modelPath);
+     this.startThreshold = startThreshold;
+     this.endThreshold = endThreshold;
+     this.samplingRate = samplingRate;
+     this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
+     this.speechPadSamples = samplingRate * speechPadMs / 1000;
+     this.reset();
+     console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
+   }
+
+   reset() {
+     this.model.resetStates();
+     this.triggered = false;
+     this.tempEnd = 0;
+     this.currentSample = 0;
+     console.log('VadDetector reset');
+   }
+
+   async apply(data, returnSeconds) {
+     console.log(`Applying VAD to data of length ${data.length}`);
+     const windowSizeSamples = data.length;
+     this.currentSample += windowSizeSamples;
+
+     const rowLength = this.samplingRate === 16000 ? 512 : 256;
+
+     // Ensure data is exactly one model window long.
+     if (data.length < rowLength) {
+       console.warn(`Input data length (${data.length}) is less than required (${rowLength}). Padding with zeros.`);
+       data = [...data, ...new Array(rowLength - data.length).fill(0)];
+     } else if (data.length > rowLength) {
+       console.warn(`Input data length (${data.length}) is greater than required (${rowLength}). Truncating.`);
+       data = data.slice(0, rowLength);
+     }
+
+     const x = [Array.from(data)];
+
+     let speechProb;
+     try {
+       console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${this.samplingRate}`);
+       const result = await this.model.call(x, this.samplingRate);
+       if (result && Array.isArray(result) && result[0] && result[0][0] !== undefined) {
+         speechProb = result[0][0];
+         console.log(`Speech probability: ${speechProb}`);
+       } else {
+         throw new Error("Unexpected response from model");
+       }
+     } catch (e) {
+       console.error("Error in VadDetector.apply:", e);
+       throw new Error("Error calling the model: " + e);
+     }
+
+     // A window back above the start threshold cancels a pending end.
+     if (speechProb >= this.startThreshold && this.tempEnd !== 0) {
+       this.tempEnd = 0;
+     }
+
+     if (speechProb >= this.startThreshold && !this.triggered) {
+       this.triggered = true;
+       // Pad the start backwards so the onset is not clipped.
+       let speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
+       console.log(`Speech start detected at sample ${speechStart}`);
+       if (returnSeconds) {
+         const speechStartSeconds = speechStart / this.samplingRate;
+         return { start: Number(speechStartSeconds.toFixed(1)) };
+       } else {
+         return { start: speechStart };
+       }
+     }
+
+     if (speechProb < this.endThreshold && this.triggered) {
+       console.log(`Potential speech end at sample ${this.currentSample}`);
+       if (this.tempEnd === 0) {
+         this.tempEnd = this.currentSample;
+       }
+
+       if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
+         console.log('Silence duration too short, continuing');
+         return {};
+       } else {
+         const speechEnd = this.tempEnd + this.speechPadSamples;
+         console.log(`Speech end confirmed at sample ${speechEnd}`);
+         this.tempEnd = 0;
+         this.triggered = false;
+
+         if (returnSeconds) {
+           const speechEndSeconds = speechEnd / this.samplingRate;
+           return { end: Number(speechEndSeconds.toFixed(1)) };
+         } else {
+           return { end: speechEnd };
+         }
+       }
+     }
+
+     return {};
+   }
+
+   async close() {
+     this.reset();
+     await this.model.close();
+   }
+ }
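
A sketch of the detector's streaming contract, assuming a hypothetical iterable `windows` of 512-sample Float32Arrays at 16 kHz: most calls return {}, with a single { start } or { end } marker (samples by default, seconds when returnSeconds is true) at each transition:

    const detector = new VadDetector(0.6, 0.45, 16000, 600, 500);
    for (const window of windows) {
      const result = await detector.apply(window, true);
      if (result.start !== undefined) console.log(`speech starts at ${result.start}s`);
      if (result.end !== undefined) console.log(`speech ends at ${result.end}s`);
    }
    await detector.close();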
globals.css ADDED
@@ -0,0 +1,33 @@
+ @tailwind base;
+ @tailwind components;
+ @tailwind utilities;
+
+ :root {
+   --foreground-rgb: 0, 0, 0;
+   --background-start-rgb: 214, 219, 220;
+   --background-end-rgb: 255, 255, 255;
+ }
+
+ @media (prefers-color-scheme: dark) {
+   :root {
+     --foreground-rgb: 255, 255, 255;
+     --background-start-rgb: 0, 0, 0;
+     --background-end-rgb: 0, 0, 0;
+   }
+ }
+
+ body {
+   color: rgb(var(--foreground-rgb));
+   background: linear-gradient(
+       to bottom,
+       transparent,
+       rgb(var(--background-end-rgb))
+     )
+     rgb(var(--background-start-rgb));
+ }
+
+ @layer utilities {
+   .text-balance {
+     text-wrap: balance;
+   }
+ }
index.html ADDED
@@ -0,0 +1,64 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8">
+   <meta name="viewport" content="width=device-width, initial-scale=1.0">
+   <title>Voice Activity Detection Demo</title>
+   <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js"></script>
+   <script src="https://cdn.tailwindcss.com"></script>
+   <script src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
+   <script>
+     // Load the onnxruntime .wasm binaries from the same CDN build.
+     ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/';
+   </script>
+ </head>
+ <body>
+   <main class="flex min-h-screen flex-col items-center justify-between p-24">
+     <div class="text-center">
+       <div id="status" class="text-4xl mb-4">🔇 Not Listening</div>
+       <div id="audioList" class="space-y-4"></div>
+     </div>
+   </main>
+
+   <script type="module">
+     // Importing a .ts module directly assumes a dev server that
+     // transpiles TypeScript on the fly (e.g. Vite); a plain static
+     // server will not execute this file.
+     import { SpeechChunks } from './SpeechChunks.ts';
+
+     let speechChunks;
+
+     function updateStatus(isListening) {
+       document.getElementById('status').textContent = isListening ? "🎙️ Listening..." : "🔇 Not Listening";
+     }
+
+     function addAudioToList(blob) {
+       const audioList = document.getElementById('audioList');
+       const audio = document.createElement('audio');
+       audio.controls = true;
+       audio.src = URL.createObjectURL(blob);
+       audio.onended = () => URL.revokeObjectURL(audio.src);
+       audioList.appendChild(audio);
+     }
+
+     async function initializeSpeechChunks() {
+       try {
+         speechChunks = new SpeechChunks(
+           () => {
+             console.log("speech start");
+             updateStatus(true);
+           },
+           (blob) => {
+             console.log("speech end");
+             updateStatus(false);
+             addAudioToList(blob);
+           }
+         );
+         await speechChunks.start();
+       } catch (error) {
+         console.error("Error initializing SpeechChunks:", error);
+         updateStatus(false);
+         document.getElementById('status').textContent = "Error: " + error.message;
+       }
+     }
+
+     initializeSpeechChunks();
+   </script>
+ </body>
+ </html>