cgisky committed on
Commit
892b6ce
·
verified ·
1 Parent(s): 9bf261b

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sherpa-onnx-wasm-main-tts.data filter=lfs diff=lfs merge=lfs -text
app-tts.js ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// DOM handles used by the text-to-speech demo page.
const generateBtn = document.getElementById('generateBtn');
const hint = document.getElementById('hint');
const speakerIdLabel = document.getElementById('speakerIdLabel');
const speakerIdInput = document.getElementById('speakerId');
const speedInput = document.getElementById('speed');
const speedValue = document.getElementById('speedValue');
const textArea = document.getElementById('text');
const soundClips = document.getElementById('sound-clips');

// Show the slider's initial value next to it. textContent is used because the
// value is plain text, not markup.
speedValue.textContent = speedInput.value;

// Running counter used to number the generated clips.
let index = 0;

// The OfflineTts instance; assigned once the runtime has been initialized.
let tts = null;

// Created lazily on the first click of the Generate button.
let audioCtx = null;

// Explicit global (instead of an implicit one, which throws in strict mode).
// NOTE(review): presumably picked up by sherpa-onnx-wasm-main-tts.js, which
// index.html loads after this script - confirm against that file.
window.Module = {};

// Creates the tts object, updates the UI, and enables the Generate button.
Module.onRuntimeInitialized = function() {
  console.log('Model files downloaded!');

  console.log('Initializing tts ......');
  tts = createOfflineTts(Module);
  if (tts.numSpeakers > 1) {
    speakerIdLabel.textContent = `Speaker ID (0 - ${tts.numSpeakers - 1}):`;
  }

  hint.innerText =
      'Initialized! Please enter text and click the Generate button.';

  generateBtn.disabled = false;
};

// Keep the displayed speed in sync with the slider.
speedInput.oninput = function() {
  speedValue.textContent = this.value;
};
41
+
42
// Click handler for the Generate button: validates the speaker id and text,
// runs synthesis, plays the result through the Web Audio API, and appends a
// playable clip to the page.
generateBtn.onclick = function() {
  const rawSpeakerId = speakerIdInput.value;
  if (rawSpeakerId.trim().length === 0) {
    alert('Please input a speakerId');
    return;
  }

  const maxSpeakerId = tts.numSpeakers - 1;
  if (!/^\d+$/.test(rawSpeakerId)) {
    alert(`Input speakerID ${
        rawSpeakerId} is not a number.\nPlease enter a number between 0 and ${
        maxSpeakerId}`);
    return;
  }

  const speakerId = parseInt(rawSpeakerId, 10);
  if (speakerId > maxSpeakerId) {
    alert(`Please enter a number between 0 and ${maxSpeakerId}`);
    return;
  }

  const text = textArea.value.trim();
  if (text.length === 0) {
    alert('Please input a non-blank text');
    return;
  }

  // The slider value is a string; pass a real number to the generator.
  const speed = parseFloat(speedInput.value);

  console.log('speakerId', speakerId);
  console.log('speed', speed);
  console.log('text', text);

  const audio = tts.generate({text: text, sid: speakerId, speed: speed});

  console.log(audio.samples.length, audio.sampleRate);

  // createBuffer() rejects a zero-length buffer, so stop before playback
  // when nothing was synthesized.
  if (audio.samples.length === 0) {
    alert('No audio was generated for the given text');
    return;
  }

  if (!audioCtx) {
    audioCtx = new AudioContext({sampleRate: tts.sampleRate});
  }

  const buffer =
      audioCtx.createBuffer(1, audio.samples.length, tts.sampleRate);

  // Copy all samples into channel 0 in one call.
  buffer.getChannelData(0).set(audio.samples);

  const source = audioCtx.createBufferSource();
  source.buffer = buffer;
  source.connect(audioCtx.destination);
  source.start();

  createAudioTag(audio);
};
93
+
94
/**
 * Appends a new clip (audio player, label, delete button) to the sound-clips
 * container.
 *
 * The label is the running clip index followed by the first 100 characters of
 * the trimmed text area content. Clicking the label prompts for a new name;
 * clicking Delete removes the clip and releases its object URL.
 *
 * @param {{samples: Float32Array, sampleRate: number}} generateAudio
 *     Audio as returned by tts.generate().
 */
function createAudioTag(generateAudio) {
  const blob = toWav(generateAudio.samples, generateAudio.sampleRate);

  const text = textArea.value.trim().substring(0, 100);
  const clipName = `${index} ${text} ...`;
  index += 1;

  const clipContainer = document.createElement('article');
  const clipLabel = document.createElement('p');
  const audio = document.createElement('audio');
  const deleteButton = document.createElement('button');

  clipContainer.classList.add('clip');
  audio.controls = true;
  deleteButton.textContent = 'Delete';
  deleteButton.className = 'delete';
  clipLabel.textContent = clipName;

  clipContainer.appendChild(audio);
  clipContainer.appendChild(clipLabel);
  clipContainer.appendChild(deleteButton);
  soundClips.appendChild(clipContainer);

  const audioURL = window.URL.createObjectURL(blob);
  audio.src = audioURL;

  deleteButton.onclick = function() {
    clipContainer.remove();
    // Release the blob backing this clip; without this the WAV data stays
    // alive until the page is unloaded.
    window.URL.revokeObjectURL(audioURL);
  };

  clipLabel.onclick = function() {
    const newClipName = prompt('Enter a new name for your sound clip?');
    // A cancelled prompt returns null: keep the current name in that case.
    if (newClipName !== null) {
      clipLabel.textContent = newClipName;
    }
  };
}
138
+
139
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
/**
 * Encodes mono float samples as a 16-bit PCM WAV file.
 *
 * Samples are clamped to [-1, 1] and scaled by 32767. The header is the
 * 44-byte layout described at http://soundfile.sapp.org/doc/WaveFormat/
 *
 * @param {Float32Array} floatSamples Mono samples.
 * @param {number} sampleRate Sample rate in Hz.
 * @returns {Blob} A blob of type 'audio/wav'.
 */
function toWav(floatSamples, sampleRate) {
  const numSamples = floatSamples.length;
  const dataBytes = numSamples * 2;  // 2 bytes per 16-bit sample

  const buf = new ArrayBuffer(44 + dataBytes);
  const view = new DataView(buf);

  //                   F F I R
  view.setUint32(0, 0x46464952, true);       // chunkID
  view.setUint32(4, 36 + dataBytes, true);   // chunkSize
  //                   E V A W
  view.setUint32(8, 0x45564157, true);       // format
  //                      t m f
  view.setUint32(12, 0x20746d66, true);      // subchunk1ID
  view.setUint32(16, 16, true);              // subchunk1Size, 16 for PCM
  // audioFormat is a 16-bit field; a 32-bit write here would spill into
  // numChannels.
  view.setUint16(20, 1, true);               // audioFormat, 1 for PCM
  view.setUint16(22, 1, true);               // numChannels: 1 channel
  view.setUint32(24, sampleRate, true);      // sampleRate
  view.setUint32(28, sampleRate * 2, true);  // byteRate
  view.setUint16(32, 2, true);               // blockAlign
  view.setUint16(34, 16, true);              // bitsPerSample
  view.setUint32(36, 0x61746164, true);      // Subchunk2ID
  view.setUint32(40, dataBytes, true);       // subchunk2Size

  let offset = 44;
  for (let i = 0; i < numSamples; ++i) {
    let s = floatSamples[i];
    if (s >= 1) {
      s = 1;
    } else if (s <= -1) {
      s = -1;
    }

    // setInt16 truncates the fractional part, like an Int16Array store.
    view.setInt16(offset, s * 32767, true);
    offset += 2;
  }

  return new Blob([view], {type: 'audio/wav'});
}
index.html CHANGED
@@ -1,19 +1,46 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!doctype html>
<html lang="en">

<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width" />
  <title>Next-gen Kaldi WebAssembly with sherpa-onnx for Text-to-speech</title>
  <style>
    h1,div {
      text-align: center;
    }
    textarea {
      width:100%;
    }
  </style>
</head>

<body>
  <h1>
    Next-gen Kaldi + WebAssembly<br/>
    Text-to-speech Demo with <a href="https://github.com/k2-fsa/sherpa-onnx">sherpa-onnx</a>
  </h1>
  <div>
    <span id="hint">Loading model ... ...</span>
    <br/>
    <br/>
    <label for="speakerId" id="speakerIdLabel">Speaker ID: </label>
    <input type="text" id="speakerId" name="speakerId" value="0" />
    <br/>
    <br/>
    <label for="speed" id="speedLabel">Speed: </label>
    <input type="range" id="speed" name="speed" min="0.4" max="3.5" step="0.1" value="1.0" />
    <span id="speedValue"></span>
    <br/>
    <br/>
    <textarea id="text" rows="10" placeholder="Please enter your text here and click the Generate button"></textarea>
    <br/>
    <br/>
    <button id="generateBtn" disabled>Generate</button>
  </div>
  <section flex="1" overflow="auto" id="sound-clips">
  </section>

  <!-- app-tts.js sets up Module before the wasm glue script is loaded. -->
  <script src="app-tts.js"></script>
  <script src="sherpa-onnx-tts.js"></script>
  <script src="sherpa-onnx-wasm-main-tts.js"></script>
</body>

</html>
sherpa-onnx-tts.js ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
/**
 * Releases the wasm heap memory owned by a config descriptor.
 *
 * Frees `config.buffer` when present, then recursively frees the nested
 * descriptors stored under `config`, `matcha`, and `kokoro` (in that order),
 * and finally frees `config.ptr`.
 *
 * @param {object} config Descriptor returned by one of the init* functions.
 * @param {object} Module Object providing `_free`.
 */
function freeConfig(config, Module) {
  if ('buffer' in config) {
    Module._free(config.buffer);
  }

  for (const nestedKey of ['config', 'matcha', 'kokoro']) {
    if (nestedKey in config) {
      freeConfig(config[nestedKey], Module);
    }
  }

  Module._free(config.ptr);
}
21
+
22
// The user should free the returned pointers
/**
 * Writes a VITS model config into the wasm heap.
 *
 * All strings are packed back to back (NUL-terminated) into one buffer, and a
 * 32-byte record is filled with pointers into that buffer plus three floats:
 *   +0 model, +4 lexicon, +8 tokens, +12 dataDir,
 *   +16 noiseScale, +20 noiseScaleW, +24 lengthScale, +28 dictDir.
 * NOTE(review): the record presumably mirrors a struct on the wasm side -
 * confirm the layout against the C API.
 *
 * Missing or empty string fields (including `model`) are written as ''.
 * Falsy numeric fields fall back to 0.667 / 0.8 / 1.0.
 *
 * @param {object} config VITS settings.
 * @param {object} Module Object providing `_malloc`, `lengthBytesUTF8`,
 *     `stringToUTF8`, and `setValue`.
 * @returns {{buffer: number, ptr: number, len: number}}
 */
function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) {
  // Pointer slot offset inside the record, paired with the string to store.
  // The array order is the packing order inside the string buffer.
  const stringFields = [
    {slot: 0, value: config.model || ''},
    {slot: 4, value: config.lexicon || ''},
    {slot: 8, value: config.tokens || ''},
    {slot: 12, value: config.dataDir || ''},
    {slot: 28, value: config.dictDir || ''},
  ];

  // +1 for the terminating NUL of each string.
  const sizes = stringFields.map((f) => Module.lengthBytesUTF8(f.value) + 1);
  const n = sizes.reduce((total, size) => total + size, 0);

  const buffer = Module._malloc(n);

  const len = 8 * 4;
  const ptr = Module._malloc(len);

  let cursor = buffer;
  stringFields.forEach((field, i) => {
    Module.stringToUTF8(field.value, cursor, sizes[i]);
    Module.setValue(ptr + field.slot, cursor, 'i8*');
    cursor += sizes[i];
  });

  Module.setValue(ptr + 16, config.noiseScale || 0.667, 'float');
  Module.setValue(ptr + 20, config.noiseScaleW || 0.8, 'float');
  Module.setValue(ptr + 24, config.lengthScale || 1.0, 'float');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
76
+
77
/**
 * Writes a Matcha model config into the wasm heap.
 *
 * All strings are packed back to back (NUL-terminated) into one buffer, and a
 * 32-byte record is filled with pointers into that buffer plus two floats:
 *   +0 acousticModel, +4 vocoder, +8 lexicon, +12 tokens, +16 dataDir,
 *   +20 noiseScale, +24 lengthScale, +28 dictDir.
 * NOTE(review): the record presumably mirrors a struct on the wasm side -
 * confirm the layout against the C API.
 *
 * Missing or empty string fields (including `acousticModel` and `vocoder`)
 * are written as ''. Falsy numeric fields fall back to 0.667 / 1.0.
 * The caller should free the returned pointers.
 *
 * @param {object} config Matcha settings.
 * @param {object} Module Object providing `_malloc`, `lengthBytesUTF8`,
 *     `stringToUTF8`, and `setValue`.
 * @returns {{buffer: number, ptr: number, len: number}}
 */
function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) {
  // Pointer slot offset inside the record, paired with the string to store.
  // The array order is the packing order inside the string buffer.
  const stringFields = [
    {slot: 0, value: config.acousticModel || ''},
    {slot: 4, value: config.vocoder || ''},
    {slot: 8, value: config.lexicon || ''},
    {slot: 12, value: config.tokens || ''},
    {slot: 16, value: config.dataDir || ''},
    {slot: 28, value: config.dictDir || ''},
  ];

  // +1 for the terminating NUL of each string.
  const sizes = stringFields.map((f) => Module.lengthBytesUTF8(f.value) + 1);
  const n = sizes.reduce((total, size) => total + size, 0);

  const buffer = Module._malloc(n);

  const len = 8 * 4;
  const ptr = Module._malloc(len);

  let cursor = buffer;
  stringFields.forEach((field, i) => {
    Module.stringToUTF8(field.value, cursor, sizes[i]);
    Module.setValue(ptr + field.slot, cursor, 'i8*');
    cursor += sizes[i];
  });

  Module.setValue(ptr + 20, config.noiseScale || 0.667, 'float');
  Module.setValue(ptr + 24, config.lengthScale || 1.0, 'float');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
138
+
139
/**
 * Writes a Kokoro model config into the wasm heap.
 *
 * All strings are packed back to back (NUL-terminated) into one buffer, and a
 * 28-byte record is filled with pointers into that buffer plus one float:
 *   +0 model, +4 voices, +8 tokens, +12 dataDir,
 *   +16 lengthScale, +20 dictDir, +24 lexicon.
 * NOTE(review): the record presumably mirrors a struct on the wasm side -
 * confirm the layout against the C API.
 *
 * Missing or empty string fields (including `model` and `voices`) are written
 * as ''. A falsy lengthScale falls back to 1.0.
 * The caller should free the returned pointers.
 *
 * @param {object} config Kokoro settings.
 * @param {object} Module Object providing `_malloc`, `lengthBytesUTF8`,
 *     `stringToUTF8`, and `setValue`.
 * @returns {{buffer: number, ptr: number, len: number}}
 */
function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
  // Pointer slot offset inside the record, paired with the string to store.
  // The array order is the packing order inside the string buffer.
  const stringFields = [
    {slot: 0, value: config.model || ''},
    {slot: 4, value: config.voices || ''},
    {slot: 8, value: config.tokens || ''},
    {slot: 12, value: config.dataDir || ''},
    {slot: 20, value: config.dictDir || ''},
    {slot: 24, value: config.lexicon || ''},
  ];

  // +1 for the terminating NUL of each string.
  const sizes = stringFields.map((f) => Module.lengthBytesUTF8(f.value) + 1);
  const n = sizes.reduce((total, size) => total + size, 0);

  const buffer = Module._malloc(n);

  const len = 7 * 4;
  const ptr = Module._malloc(len);

  let cursor = buffer;
  stringFields.forEach((field, i) => {
    Module.stringToUTF8(field.value, cursor, sizes[i]);
    Module.setValue(ptr + field.slot, cursor, 'i8*');
    cursor += sizes[i];
  });

  Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}
199
+
200
/**
 * Writes the combined model config into the wasm heap.
 *
 * The record is laid out as: the VITS record, numThreads (i32), debug (i32),
 * a pointer to the provider string, the Matcha record, the Kokoro record.
 * Sub-configs that are absent are replaced by empty defaults; the caller's
 * object is not modified. `provider` defaults to 'cpu', `numThreads` to 1 and
 * `debug` to 0.
 *
 * The returned descriptor keeps the three nested descriptors under `config`,
 * `matcha`, and `kokoro` so that freeConfig() can release them.
 *
 * @param {object} config Model settings.
 * @param {object} Module Object providing `_malloc`, `_CopyHeap`,
 *     `lengthBytesUTF8`, `stringToUTF8`, and `setValue`.
 * @returns {{buffer: number, ptr: number, len: number, config: object,
 *     matcha: object, kokoro: object}}
 */
function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
  const vitsSettings = config.offlineTtsVitsModelConfig || {
    model: '',
    lexicon: '',
    tokens: '',
    noiseScale: 0.667,
    noiseScaleW: 0.8,
    lengthScale: 1.0,
    dataDir: '',
    dictDir: '',
  };

  const matchaSettings = config.offlineTtsMatchaModelConfig || {
    acousticModel: '',
    vocoder: '',
    lexicon: '',
    tokens: '',
    noiseScale: 0.667,
    lengthScale: 1.0,
    dataDir: '',
    dictDir: '',
  };

  const kokoroSettings = config.offlineTtsKokoroModelConfig || {
    model: '',
    voices: '',
    tokens: '',
    lengthScale: 1.0,
    dataDir: '',
    dictDir: '',
    lexicon: '',
  };

  const vitsModelConfig =
      initSherpaOnnxOfflineTtsVitsModelConfig(vitsSettings, Module);

  const matchaModelConfig =
      initSherpaOnnxOfflineTtsMatchaModelConfig(matchaSettings, Module);

  const kokoroModelConfig =
      initSherpaOnnxOfflineTtsKokoroModelConfig(kokoroSettings, Module);

  // Three extra 4-byte fields: numThreads, debug, provider pointer.
  const len = vitsModelConfig.len + matchaModelConfig.len +
      kokoroModelConfig.len + 3 * 4;

  const ptr = Module._malloc(len);

  let offset = 0;
  Module._CopyHeap(vitsModelConfig.ptr, vitsModelConfig.len, ptr + offset);
  offset += vitsModelConfig.len;

  Module.setValue(ptr + offset, config.numThreads || 1, 'i32');
  offset += 4;

  Module.setValue(ptr + offset, config.debug || 0, 'i32');
  offset += 4;

  // Use the same fallback for sizing and for writing, so a missing provider
  // is stored as 'cpu' instead of being passed on as undefined.
  const provider = config.provider || 'cpu';
  const providerLen = Module.lengthBytesUTF8(provider) + 1;
  const buffer = Module._malloc(providerLen);
  Module.stringToUTF8(provider, buffer, providerLen);
  Module.setValue(ptr + offset, buffer, 'i8*');
  offset += 4;

  Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset);
  offset += matchaModelConfig.len;

  Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset);
  offset += kokoroModelConfig.len;

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
    config: vitsModelConfig,
    matcha: matchaModelConfig,
    kokoro: kokoroModelConfig,
  };
}
281
+
282
/**
 * Writes the top-level offline TTS config into the wasm heap.
 *
 * The record is the model config record followed by three 4-byte fields:
 * pointer to ruleFsts, maxNumSentences (i32), pointer to ruleFars. Both rule
 * strings live in one buffer, ruleFsts first. Missing rule strings are
 * written as '' and a falsy maxNumSentences becomes 1.
 *
 * @param {object} config Settings; `offlineTtsModelConfig` is passed on to
 *     initSherpaOnnxOfflineTtsModelConfig.
 * @param {object} Module Object providing `_malloc`, `_CopyHeap`,
 *     `lengthBytesUTF8`, `stringToUTF8`, and `setValue`.
 * @returns {{buffer: number, ptr: number, len: number, config: object}}
 */
function initSherpaOnnxOfflineTtsConfig(config, Module) {
  const modelConfig =
      initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module);

  const len = modelConfig.len + 3 * 4;
  const ptr = Module._malloc(len);

  // The model record occupies the start of this record.
  Module._CopyHeap(modelConfig.ptr, modelConfig.len, ptr);

  const ruleFstsLen = Module.lengthBytesUTF8(config.ruleFsts || '') + 1;
  const ruleFarsLen = Module.lengthBytesUTF8(config.ruleFars || '') + 1;

  const buffer = Module._malloc(ruleFstsLen + ruleFarsLen);
  const ruleFarsPtr = buffer + ruleFstsLen;
  Module.stringToUTF8(config.ruleFsts || '', buffer, ruleFstsLen);
  Module.stringToUTF8(config.ruleFars || '', ruleFarsPtr, ruleFarsLen);

  // Trailing fields start right after the copied model record.
  const tail = ptr + modelConfig.len;
  Module.setValue(tail, buffer, 'i8*');
  Module.setValue(tail + 4, config.maxNumSentences || 1, 'i32');
  Module.setValue(tail + 8, ruleFarsPtr, 'i8*');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
    config: modelConfig,
  };
}
311
+
312
/**
 * Thin wrapper around the offline TTS functions exported by the wasm module.
 */
class OfflineTts {
  /**
   * Creates the native TTS object from a JS config.
   *
   * The config is written to the wasm heap, handed to
   * `_SherpaOnnxCreateOfflineTts`, and released again right afterwards.
   *
   * @param {object} configObj Config accepted by
   *     initSherpaOnnxOfflineTtsConfig.
   * @param {object} Module The wasm module object.
   */
  constructor(configObj, Module) {
    console.log(configObj);
    const config = initSherpaOnnxOfflineTtsConfig(configObj, Module);
    const handle = Module._SherpaOnnxCreateOfflineTts(config.ptr);

    freeConfig(config, Module);

    this.handle = handle;
    this.sampleRate = Module._SherpaOnnxOfflineTtsSampleRate(this.handle);
    this.numSpeakers = Module._SherpaOnnxOfflineTtsNumSpeakers(this.handle);
    this.Module = Module;
  }

  /** Destroys the native TTS object and clears the handle. */
  free() {
    this.Module._SherpaOnnxDestroyOfflineTts(this.handle);
    this.handle = 0;
  }

  /**
   * Synthesizes speech for the given text.
   *
   * Example config:
   * {
   *   text: "hello",
   *   sid: 1,
   *   speed: 1.0
   * }
   *
   * @param {{text: string, sid: number, speed: number}} config
   * @returns {{samples: Float32Array, sampleRate: number}} The samples are
   *     copied out of the wasm heap before the native result is destroyed.
   */
  generate(config) {
    const textLen = this.Module.lengthBytesUTF8(config.text) + 1;
    const textPtr = this.Module._malloc(textLen);

    let h;
    try {
      this.Module.stringToUTF8(config.text, textPtr, textLen);
      h = this.Module._SherpaOnnxOfflineTtsGenerate(
          this.handle, textPtr, config.sid, config.speed);
    } finally {
      // The text copy is only needed for the duration of the call; release
      // it even if the call throws.
      this.Module._free(textPtr);
    }

    // Result record as 32-bit words: [0] samples pointer, [1] number of
    // samples, [2] sample rate.
    const numSamples = this.Module.HEAP32[h / 4 + 1];
    const sampleRate = this.Module.HEAP32[h / 4 + 2];

    // Byte address -> index into the float view.
    const samplesPtr = this.Module.HEAP32[h / 4] / 4;
    // slice() copies, so the data survives the destroy call below.
    const samples =
        this.Module.HEAPF32.slice(samplesPtr, samplesPtr + numSamples);

    this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(h);
    return {samples: samples, sampleRate: sampleRate};
  }

  /**
   * Writes audio to a wave file via `_SherpaOnnxWriteWave`.
   *
   * @param {string} filename Destination path.
   * @param {{samples: Float32Array, sampleRate: number}} audio
   */
  save(filename, audio) {
    const samples = audio.samples;
    const sampleRate = audio.sampleRate;

    const ptr = this.Module._malloc(samples.length * 4);
    const filenameLen = this.Module.lengthBytesUTF8(filename) + 1;
    const buffer = this.Module._malloc(filenameLen);
    try {
      for (let i = 0; i < samples.length; i++) {
        this.Module.HEAPF32[ptr / 4 + i] = samples[i];
      }
      this.Module.stringToUTF8(filename, buffer, filenameLen);
      this.Module._SherpaOnnxWriteWave(ptr, samples.length, sampleRate, buffer);
    } finally {
      this.Module._free(buffer);
      this.Module._free(ptr);
    }
  }
}
372
+
373
/**
 * Creates an OfflineTts instance.
 *
 * Without `myConfig`, a built-in configuration is used that selects the
 * Kokoro model files (./model.onnx, ./voices.bin, ./tokens.txt, ...) and
 * leaves the VITS and Matcha model paths empty. When `myConfig` is given it
 * replaces the built-in configuration entirely.
 *
 * @param {object} Module The wasm module object.
 * @param {object} [myConfig] Complete config to use instead of the default.
 * @returns {OfflineTts}
 */
function createOfflineTts(Module, myConfig) {
  const offlineTtsVitsModelConfig = {
    // Empty, but present, so the VITS packer always receives a string.
    model: '',
    lexicon: '',
    tokens: '',
    dataDir: './espeak-ng-data',
    dictDir: '',
    noiseScale: 0.667,
    noiseScaleW: 0.8,
    lengthScale: 1.0,
  };

  const offlineTtsMatchaModelConfig = {
    acousticModel: '',
    vocoder: '',
    lexicon: '',
    tokens: '',
    dataDir: '',
    dictDir: '',
    noiseScale: 0.667,
    lengthScale: 1.0,
  };

  const offlineTtsKokoroModelConfig = {
    model: './model.onnx',
    voices: './voices.bin',
    tokens: './tokens.txt',
    dataDir: './espeak-ng-data',
    lengthScale: 1.0,
    dictDir: './dic',
    lexicon: './lexicon-us-en.txt,./lexicon-zh.txt',
  };

  const offlineTtsModelConfig = {
    offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
    offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
    offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
    numThreads: 1,
    debug: 1,
    provider: 'cpu',
  };

  let offlineTtsConfig = {
    offlineTtsModelConfig: offlineTtsModelConfig,
    ruleFsts: '',
    ruleFars: '',
    maxNumSentences: 1,
  };

  if (myConfig) {
    offlineTtsConfig = myConfig;
  }

  return new OfflineTts(offlineTtsConfig, Module);
}
427
+
428
+ if (typeof process == 'object' && typeof process.versions == 'object' &&
429
+ typeof process.versions.node == 'string') {
430
+ module.exports = {
431
+ createOfflineTts,
432
+ };
433
+ }
sherpa-onnx-wasm-main-tts.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c357fc1a6d0c10ecff7db942dab4aeab2ae29ab7285fbeeb7e0d7d95451b2b7
3
+ size 399772723
sherpa-onnx-wasm-main-tts.js ADDED
The diff for this file is too large to render. See raw diff
 
sherpa-onnx-wasm-main-tts.wasm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87ab618b91545a84ff6152d5247249fec658839aa15c2401cfcee198f8e252f5
3
+ size 11738971