mkiol committed on
Commit
2b85a29
1 Parent(s): a01d6dd

add mimic3 models

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +32 -0
  2. voices/af_ZA/google-nwu_low/ALIASES +3 -0
  3. voices/af_ZA/google-nwu_low/LICENSE +1 -0
  4. voices/af_ZA/google-nwu_low/README.md +311 -0
  5. voices/af_ZA/google-nwu_low/README.md.in +5 -0
  6. voices/af_ZA/google-nwu_low/SOURCE +1 -0
  7. voices/af_ZA/google-nwu_low/VERSION +1 -0
  8. voices/af_ZA/google-nwu_low/config.json +166 -0
  9. voices/af_ZA/google-nwu_low/generator.onnx +3 -0
  10. voices/af_ZA/google-nwu_low/phonemes.txt +60 -0
  11. voices/af_ZA/google-nwu_low/speaker_map.csv +9 -0
  12. voices/af_ZA/google-nwu_low/speakers.txt +9 -0
  13. voices/bn/multi_low/ALIASES +2 -0
  14. voices/bn/multi_low/README.md +299 -0
  15. voices/bn/multi_low/README.md.in +8 -0
  16. voices/bn/multi_low/SOURCE +2 -0
  17. voices/bn/multi_low/VERSION +1 -0
  18. voices/bn/multi_low/cmu-indic/LICENSE +20 -0
  19. voices/bn/multi_low/cmu-indic/SOURCE +1 -0
  20. voices/bn/multi_low/config.json +154 -0
  21. voices/bn/multi_low/generator.onnx +3 -0
  22. voices/bn/multi_low/google/LICENSE +1 -0
  23. voices/bn/multi_low/google/SOURCE +1 -0
  24. voices/bn/multi_low/phoneme_map.txt +1 -0
  25. voices/bn/multi_low/phonemes.txt +57 -0
  26. voices/bn/multi_low/speaker_map.csv +16 -0
  27. voices/bn/multi_low/speakers.txt +16 -0
  28. voices/de_DE/m-ailabs_low/ALIASES +1 -0
  29. voices/de_DE/m-ailabs_low/LICENSE +8 -0
  30. voices/de_DE/m-ailabs_low/README.md +296 -0
  31. voices/de_DE/m-ailabs_low/README.md.in +5 -0
  32. voices/de_DE/m-ailabs_low/SOURCE +1 -0
  33. voices/de_DE/m-ailabs_low/VERSION +1 -0
  34. voices/de_DE/m-ailabs_low/config.json +151 -0
  35. voices/de_DE/m-ailabs_low/generator.onnx +3 -0
  36. voices/de_DE/m-ailabs_low/phoneme_map.txt +2 -0
  37. voices/de_DE/m-ailabs_low/phonemes.txt +57 -0
  38. voices/de_DE/m-ailabs_low/speaker_map.csv +5 -0
  39. voices/de_DE/m-ailabs_low/speakers.txt +5 -0
  40. voices/de_DE/thorsten-emotion_low/LICENSE +1 -0
  41. voices/de_DE/thorsten-emotion_low/README.md +291 -0
  42. voices/de_DE/thorsten-emotion_low/README.md.in +5 -0
  43. voices/de_DE/thorsten-emotion_low/SOURCE +1 -0
  44. voices/de_DE/thorsten-emotion_low/VERSION +1 -0
  45. voices/de_DE/thorsten-emotion_low/config.json +157 -0
  46. voices/de_DE/thorsten-emotion_low/generator.onnx +3 -0
  47. voices/de_DE/thorsten-emotion_low/phonemes.txt +56 -0
  48. voices/de_DE/thorsten-emotion_low/speaker_map.csv +8 -0
  49. voices/de_DE/thorsten-emotion_low/speakers.txt +8 -0
  50. voices/de_DE/thorsten_low/ALIASES +4 -0
README.md CHANGED
@@ -1,3 +1,35 @@
1
  ---
2
  license: cc-by-sa-4.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
1
  ---
2
  license: cc-by-sa-4.0
3
+ language:
4
+ - af
5
+ - bn
6
+ - de
7
+ - el
8
+ - en
9
+ - en
10
+ - es
11
+ - fa
12
+ - fi
13
+ - fr
14
+ - gu
15
+ - ha
16
+ - hu
17
+ - it
18
+ - jv
19
+ - ko
20
+ - ne
21
+ - nl
22
+ - pl
23
+ - ru
24
+ - sw
25
+ - te
26
+ - tn
27
+ - uk
28
+ - vi
29
+ - yo
30
  ---
31
+
32
+ Voice models for the Mimic 3 text-to-speech system.
33
+
34
+ Original source: https://github.com/MycroftAI/mimic3-voices
35
+
voices/af_ZA/google-nwu_low/ALIASES ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ af
2
+ af_ZA
3
+ af_ZA/google-nwu
voices/af_ZA/google-nwu_low/LICENSE ADDED
@@ -0,0 +1 @@
 
 
1
+ Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
voices/af_ZA/google-nwu_low/README.md ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Afrikaans Google/North West University (Low Quality)
2
+
3
+ A multi-speaker model for Afrikaans based on the [Google/NWU dataset](http://www.openslr.org/32/).
4
+
5
+ See LICENSE file for license.
6
+
7
+
8
+ ## Phonemes
9
+
10
+ <table><thead><tr><th>&nbsp;</th><th>Phoneme</th><th>Description</th></tr></thead>
11
+ <tr>
12
+ <td> 0 </td>
13
+ <td> _ </td>
14
+ <td> padding </td>
15
+ </tr>
16
+ <tr>
17
+ <td> 1 </td>
18
+ <td> ^ </td>
19
+ <td> start utterance </td>
20
+ </tr>
21
+ <tr>
22
+ <td> 2 </td>
23
+ <td> $ </td>
24
+ <td> end utterance </td>
25
+ </tr>
26
+ <tr>
27
+ <td> 3 </td>
28
+ <td> , </td>
29
+ <td> short pause (minor break) </td>
30
+ </tr>
31
+ <tr>
32
+ <td> 4 </td>
33
+ <td> . </td>
34
+ <td> long pause (major break) </td>
35
+ </tr>
36
+ <tr>
37
+ <td> 5 </td>
38
+ <td> # </td>
39
+ <td> word break </td>
40
+ </tr>
41
+ <tr>
42
+ <td> 6 </td>
43
+ <td> ˈ </td>
44
+ <td> primary stress </td>
45
+ </tr>
46
+ <tr>
47
+ <td> 7 </td>
48
+ <td> ˌ </td>
49
+ <td> secondary stress </td>
50
+ </tr>
51
+ <tr>
52
+ <td> 8 </td>
53
+ <td> - </td>
54
+ <td> </td>
55
+ </tr>
56
+ <tr>
57
+ <td> 9 </td>
58
+ <td> a </td>
59
+ <td> vowel open front unrounded [<a title="Audio sample for vowel open front unrounded " href="../../../phonemes/open_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
60
+ </tr>
61
+ <tr>
62
+ <td> 10 </td>
63
+ <td> b </td>
64
+ <td> consonant plosive bilabial voiced [<a title="Audio sample for consonant plosive bilabial voiced " href="../../../phonemes/voiced_bilabial_plosive.wav?raw=true">Sample</a>] </td>
65
+ </tr>
66
+ <tr>
67
+ <td> 11 </td>
68
+ <td> c </td>
69
+ <td> consonant plosive palatal unvoiced [<a title="Audio sample for consonant plosive palatal unvoiced " href="../../../phonemes/voiceless_palatal_plosive.wav?raw=true">Sample</a>] </td>
70
+ </tr>
71
+ <tr>
72
+ <td> 12 </td>
73
+ <td> d </td>
74
+ <td> consonant plosive alveolar voiced [<a title="Audio sample for consonant plosive alveolar voiced " href="../../../phonemes/voiced_alveolar_plosive.wav?raw=true">Sample</a>] </td>
75
+ </tr>
76
+ <tr>
77
+ <td> 13 </td>
78
+ <td> e </td>
79
+ <td> vowel close-mid front unrounded [<a title="Audio sample for vowel close-mid front unrounded " href="../../../phonemes/close-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
80
+ </tr>
81
+ <tr>
82
+ <td> 14 </td>
83
+ <td> f </td>
84
+ <td> consonant fricative labio-dental unvoiced [<a title="Audio sample for consonant fricative labio-dental unvoiced " href="../../../phonemes/voiceless_labiodental_fricative.wav?raw=true">Sample</a>] </td>
85
+ </tr>
86
+ <tr>
87
+ <td> 15 </td>
88
+ <td> h </td>
89
+ <td> consonant fricative glottal unvoiced [<a title="Audio sample for consonant fricative glottal unvoiced " href="../../../phonemes/voiceless_glottal_fricative.wav?raw=true">Sample</a>] </td>
90
+ </tr>
91
+ <tr>
92
+ <td> 16 </td>
93
+ <td> i </td>
94
+ <td> vowel close front unrounded [<a title="Audio sample for vowel close front unrounded " href="../../../phonemes/close_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
95
+ </tr>
96
+ <tr>
97
+ <td> 17 </td>
98
+ <td> j </td>
99
+ <td> consonant approximant palatal voiced [<a title="Audio sample for consonant approximant palatal voiced " href="../../../phonemes/palatal_approximant.wav?raw=true">Sample</a>] </td>
100
+ </tr>
101
+ <tr>
102
+ <td> 18 </td>
103
+ <td> k </td>
104
+ <td> consonant plosive velar unvoiced [<a title="Audio sample for consonant plosive velar unvoiced " href="../../../phonemes/voiceless_velar_plosive.wav?raw=true">Sample</a>] </td>
105
+ </tr>
106
+ <tr>
107
+ <td> 19 </td>
108
+ <td> l </td>
109
+ <td> consonant lateral-approximant alveolar voiced [<a title="Audio sample for consonant lateral-approximant alveolar voiced " href="../../../phonemes/alveolar_lateral_approximant.wav?raw=true">Sample</a>] </td>
110
+ </tr>
111
+ <tr>
112
+ <td> 20 </td>
113
+ <td> m </td>
114
+ <td> consonant nasal bilabial voiced [<a title="Audio sample for consonant nasal bilabial voiced " href="../../../phonemes/bilabial_nasal.wav?raw=true">Sample</a>] </td>
115
+ </tr>
116
+ <tr>
117
+ <td> 21 </td>
118
+ <td> n </td>
119
+ <td> consonant nasal alveolar voiced [<a title="Audio sample for consonant nasal alveolar voiced " href="../../../phonemes/alveolar_nasal.wav?raw=true">Sample</a>] </td>
120
+ </tr>
121
+ <tr>
122
+ <td> 22 </td>
123
+ <td> o </td>
124
+ <td> vowel close-mid back rounded [<a title="Audio sample for vowel close-mid back rounded " href="../../../phonemes/close-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
125
+ </tr>
126
+ <tr>
127
+ <td> 23 </td>
128
+ <td> p </td>
129
+ <td> consonant plosive bilabial unvoiced [<a title="Audio sample for consonant plosive bilabial unvoiced " href="../../../phonemes/voiceless_bilabial_plosive.wav?raw=true">Sample</a>] </td>
130
+ </tr>
131
+ <tr>
132
+ <td> 24 </td>
133
+ <td> r </td>
134
+ <td> consonant trill alveolar voiced [<a title="Audio sample for consonant trill alveolar voiced " href="../../../phonemes/alveolar_trill.wav?raw=true">Sample</a>] </td>
135
+ </tr>
136
+ <tr>
137
+ <td> 25 </td>
138
+ <td> s </td>
139
+ <td> consonant fricative alveolar unvoiced [<a title="Audio sample for consonant fricative alveolar unvoiced " href="../../../phonemes/voiceless_alveolar_fricative.wav?raw=true">Sample</a>] </td>
140
+ </tr>
141
+ <tr>
142
+ <td> 26 </td>
143
+ <td> t </td>
144
+ <td> consonant plosive alveolar unvoiced [<a title="Audio sample for consonant plosive alveolar unvoiced " href="../../../phonemes/voiceless_alveolar_plosive.wav?raw=true">Sample</a>] </td>
145
+ </tr>
146
+ <tr>
147
+ <td> 27 </td>
148
+ <td> u </td>
149
+ <td> vowel close back rounded [<a title="Audio sample for vowel close back rounded " href="../../../phonemes/close_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
150
+ </tr>
151
+ <tr>
152
+ <td> 28 </td>
153
+ <td> v </td>
154
+ <td> consonant fricative labio-dental voiced [<a title="Audio sample for consonant fricative labio-dental voiced " href="../../../phonemes/voiced_labiodental_fricative.wav?raw=true">Sample</a>] </td>
155
+ </tr>
156
+ <tr>
157
+ <td> 29 </td>
158
+ <td> w </td>
159
+ <td> consonant approximant bilabial voiced [<a title="Audio sample for consonant approximant bilabial voiced " href="../../../phonemes/voiced_bilabial_approximant.wav?raw=true">Sample</a>] </td>
160
+ </tr>
161
+ <tr>
162
+ <td> 30 </td>
163
+ <td> x </td>
164
+ <td> consonant fricative velar unvoiced [<a title="Audio sample for consonant fricative velar unvoiced " href="../../../phonemes/voiceless_velar_fricative.wav?raw=true">Sample</a>] </td>
165
+ </tr>
166
+ <tr>
167
+ <td> 31 </td>
168
+ <td> y </td>
169
+ <td> vowel close front rounded [<a title="Audio sample for vowel close front rounded " href="../../../phonemes/close_front_rounded_vowel.wav?raw=true">Sample</a>] </td>
170
+ </tr>
171
+ <tr>
172
+ <td> 32 </td>
173
+ <td> z </td>
174
+ <td> consonant fricative alveolar voiced [<a title="Audio sample for consonant fricative alveolar voiced " href="../../../phonemes/voiced_alveolar_fricative.wav?raw=true">Sample</a>] </td>
175
+ </tr>
176
+ <tr>
177
+ <td> 33 </td>
178
+ <td> æ </td>
179
+ <td> vowel near-open front unrounded [<a title="Audio sample for vowel near-open front unrounded " href="../../../phonemes/near-open_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
180
+ </tr>
181
+ <tr>
182
+ <td> 34 </td>
183
+ <td> ð </td>
184
+ <td> consonant fricative dental voiced [<a title="Audio sample for consonant fricative dental voiced " href="../../../phonemes/voiced_dental_fricative.wav?raw=true">Sample</a>] </td>
185
+ </tr>
186
+ <tr>
187
+ <td> 35 </td>
188
+ <td> õ </td>
189
+ <td> vowel close-mid back rounded [<a title="Audio sample for vowel close-mid back rounded " href="../../../phonemes/close-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
190
+ </tr>
191
+ <tr>
192
+ <td> 36 </td>
193
+ <td> ø </td>
194
+ <td> vowel close-mid front rounded [<a title="Audio sample for vowel close-mid front rounded " href="../../../phonemes/close-mid_front_rounded_vowel.wav?raw=true">Sample</a>] </td>
195
+ </tr>
196
+ <tr>
197
+ <td> 37 </td>
198
+ <td> ŋ </td>
199
+ <td> consonant nasal velar voiced [<a title="Audio sample for consonant nasal velar voiced " href="../../../phonemes/velar_nasal.wav?raw=true">Sample</a>] </td>
200
+ </tr>
201
+ <tr>
202
+ <td> 38 </td>
203
+ <td> œ </td>
204
+ <td> vowel open-mid front rounded [<a title="Audio sample for vowel open-mid front rounded " href="../../../phonemes/open-mid_front_rounded_vowel.wav?raw=true">Sample</a>] </td>
205
+ </tr>
206
+ <tr>
207
+ <td> 39 </td>
208
+ <td> ɐ </td>
209
+ <td> vowel near-open central unrounded [<a title="Audio sample for vowel near-open central unrounded " href="../../../phonemes/near-open_central_unrounded_vowel.wav?raw=true">Sample</a>] </td>
210
+ </tr>
211
+ <tr>
212
+ <td> 40 </td>
213
+ <td> ɑ </td>
214
+ <td> vowel open back unrounded [<a title="Audio sample for vowel open back unrounded " href="../../../phonemes/open_back_unrounded_vowel.wav?raw=true">Sample</a>] </td>
215
+ </tr>
216
+ <tr>
217
+ <td> 41 </td>
218
+ <td> ɑ̃ </td>
219
+ <td> vowel open back unrounded [<a title="Audio sample for vowel open back unrounded " href="../../../phonemes/open_back_unrounded_vowel.wav?raw=true">Sample</a>] </td>
220
+ </tr>
221
+ <tr>
222
+ <td> 42 </td>
223
+ <td> ɒ </td>
224
+ <td> vowel open back rounded [<a title="Audio sample for vowel open back rounded " href="../../../phonemes/open_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
225
+ </tr>
226
+ <tr>
227
+ <td> 43 </td>
228
+ <td> ɔ </td>
229
+ <td> vowel open-mid back rounded [<a title="Audio sample for vowel open-mid back rounded " href="../../../phonemes/open-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
230
+ </tr>
231
+ <tr>
232
+ <td> 44 </td>
233
+ <td> ə </td>
234
+ <td> vowel mid central unrounded </td>
235
+ </tr>
236
+ <tr>
237
+ <td> 45 </td>
238
+ <td> ɛ </td>
239
+ <td> vowel open-mid front unrounded [<a title="Audio sample for vowel open-mid front unrounded " href="../../../phonemes/open-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
240
+ </tr>
241
+ <tr>
242
+ <td> 46 </td>
243
+ <td> ɜ </td>
244
+ <td> vowel open-mid central unrounded [<a title="Audio sample for vowel open-mid central unrounded " href="../../../phonemes/open-mid_central_unrounded_vowel.wav?raw=true">Sample</a>] </td>
245
+ </tr>
246
+ <tr>
247
+ <td> 47 </td>
248
+ <td> ɡ </td>
249
+ <td> consonant plosive velar voiced [<a title="Audio sample for consonant plosive velar voiced " href="../../../phonemes/voiced_velar_plosive.wav?raw=true">Sample</a>] </td>
250
+ </tr>
251
+ <tr>
252
+ <td> 48 </td>
253
+ <td> ɪ </td>
254
+ <td> vowel near-close near-front unrounded [<a title="Audio sample for vowel near-close near-front unrounded " href="../../../phonemes/near-close_near-front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
255
+ </tr>
256
+ <tr>
257
+ <td> 49 </td>
258
+ <td> ɬ </td>
259
+ <td> consonant lateral-fricative alveolar unvoiced </td>
260
+ </tr>
261
+ <tr>
262
+ <td> 50 </td>
263
+ <td> ɹ </td>
264
+ <td> consonant approximant alveolar voiced [<a title="Audio sample for consonant approximant alveolar voiced " href="../../../phonemes/alveolar_approximant.wav?raw=true">Sample</a>] </td>
265
+ </tr>
266
+ <tr>
267
+ <td> 51 </td>
268
+ <td> ʁ </td>
269
+ <td> consonant fricative uvular voiced [<a title="Audio sample for consonant fricative uvular voiced " href="../../../phonemes/voiced_uvular_fricative.wav?raw=true">Sample</a>] </td>
270
+ </tr>
271
+ <tr>
272
+ <td> 52 </td>
273
+ <td> ʃ </td>
274
+ <td> consonant fricative post-alveolar unvoiced [<a title="Audio sample for consonant fricative post-alveolar unvoiced " href="../../../phonemes/voiceless_postalveolar_fricative.wav?raw=true">Sample</a>] </td>
275
+ </tr>
276
+ <tr>
277
+ <td> 53 </td>
278
+ <td> ʊ </td>
279
+ <td> vowel near-close near-back rounded [<a title="Audio sample for vowel near-close near-back rounded " href="../../../phonemes/near-close_near-back_rounded_vowel.wav?raw=true">Sample</a>] </td>
280
+ </tr>
281
+ <tr>
282
+ <td> 54 </td>
283
+ <td> ʌ </td>
284
+ <td> vowel open-mid back unrounded [<a title="Audio sample for vowel open-mid back unrounded " href="../../../phonemes/open-mid_back_unrounded_vowel.wav?raw=true">Sample</a>] </td>
285
+ </tr>
286
+ <tr>
287
+ <td> 55 </td>
288
+ <td> ʒ </td>
289
+ <td> consonant fricative post-alveolar voiced [<a title="Audio sample for consonant fricative post-alveolar voiced " href="../../../phonemes/voiced_postalveolar_fricative.wav?raw=true">Sample</a>] </td>
290
+ </tr>
291
+ <tr>
292
+ <td> 56 </td>
293
+ <td> ʔ </td>
294
+ <td> consonant plosive glottal unvoiced [<a title="Audio sample for consonant plosive glottal unvoiced " href="../../../phonemes/glottal_plosive.wav?raw=true">Sample</a>] </td>
295
+ </tr>
296
+ <tr>
297
+ <td> 57 </td>
298
+ <td> ː </td>
299
+ <td> elongation </td>
300
+ </tr>
301
+ <tr>
302
+ <td> 58 </td>
303
+ <td> θ </td>
304
+ <td> consonant fricative dental unvoiced [<a title="Audio sample for consonant fricative dental unvoiced " href="../../../phonemes/voiceless_dental_fricative.wav?raw=true">Sample</a>] </td>
305
+ </tr>
306
+ <tr>
307
+ <td> 59 </td>
308
+ <td> ẽ </td>
309
+ <td> vowel close-mid front unrounded [<a title="Audio sample for vowel close-mid front unrounded " href="../../../phonemes/close-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
310
+ </tr>
311
+ </table>
voices/af_ZA/google-nwu_low/README.md.in ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Afrikaans Google/North West University (Low Quality)
2
+
3
+ A multi-speaker model for Afrikaans based on the [Google/NWU dataset](http://www.openslr.org/32/).
4
+
5
+ See LICENSE file for license.
voices/af_ZA/google-nwu_low/SOURCE ADDED
@@ -0,0 +1 @@
 
 
1
+ http://www.openslr.org/32/
voices/af_ZA/google-nwu_low/VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.1.0
voices/af_ZA/google-nwu_low/config.json ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "seed": 1234,
3
+ "epochs": 10000,
4
+ "learning_rate": 0.0002,
5
+ "betas": [
6
+ 0.8,
7
+ 0.99
8
+ ],
9
+ "eps": 1e-09,
10
+ "batch_size": 32,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1.0,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0,
18
+ "grad_clip": null,
19
+ "min_seq_length": null,
20
+ "max_seq_length": 400,
21
+ "min_spec_length": null,
22
+ "max_spec_length": null,
23
+ "min_speaker_utterances": null,
24
+ "last_epoch": 1,
25
+ "global_step": 1,
26
+ "best_loss": null,
27
+ "audio": {
28
+ "filter_length": 1024,
29
+ "hop_length": 256,
30
+ "win_length": 1024,
31
+ "mel_channels": 80,
32
+ "sample_rate": 22050,
33
+ "sample_bytes": 2,
34
+ "channels": 1,
35
+ "mel_fmin": 0.0,
36
+ "mel_fmax": null,
37
+ "ref_level_db": 20.0,
38
+ "spec_gain": 1.0,
39
+ "signal_norm": true,
40
+ "min_level_db": -100.0,
41
+ "max_norm": 1.0,
42
+ "clip_norm": true,
43
+ "symmetric_norm": true,
44
+ "do_dynamic_range_compression": true,
45
+ "convert_db_to_amp": true,
46
+ "do_trim_silence": false,
47
+ "trim_silence_db": 40.0,
48
+ "trim_margin_sec": 0.01,
49
+ "trim_keep_sec": 0.25,
50
+ "scale_mels": false
51
+ },
52
+ "model": {
53
+ "num_symbols": 60,
54
+ "n_speakers": 9,
55
+ "inter_channels": 192,
56
+ "hidden_channels": 192,
57
+ "filter_channels": 768,
58
+ "n_heads": 2,
59
+ "n_layers": 6,
60
+ "kernel_size": 3,
61
+ "p_dropout": 0.1,
62
+ "resblock": "2",
63
+ "resblock_kernel_sizes": [
64
+ 3,
65
+ 5,
66
+ 7
67
+ ],
68
+ "resblock_dilation_sizes": [
69
+ [
70
+ 1,
71
+ 2
72
+ ],
73
+ [
74
+ 2,
75
+ 6
76
+ ],
77
+ [
78
+ 3,
79
+ 12
80
+ ]
81
+ ],
82
+ "upsample_rates": [
83
+ 8,
84
+ 8,
85
+ 4
86
+ ],
87
+ "upsample_initial_channel": 256,
88
+ "upsample_kernel_sizes": [
89
+ 16,
90
+ 16,
91
+ 8
92
+ ],
93
+ "n_layers_q": 3,
94
+ "use_spectral_norm": false,
95
+ "gin_channels": 512,
96
+ "use_sdp": true
97
+ },
98
+ "phonemes": {
99
+ "phoneme_separator": "_",
100
+ "word_separator": "#",
101
+ "phoneme_to_id": null,
102
+ "pad": "_",
103
+ "bos": "^",
104
+ "eos": "$",
105
+ "blank": "_",
106
+ "blank_word": "#",
107
+ "blank_between": "tokens_and_words",
108
+ "blank_at_start": true,
109
+ "blank_at_end": true,
110
+ "simple_punctuation": true,
111
+ "punctuation_map": null,
112
+ "separate": [
113
+ "\u02c8",
114
+ "\u02cc"
115
+ ],
116
+ "separate_graphemes": false,
117
+ "separate_tones": false,
118
+ "tone_before": false,
119
+ "phoneme_map": {
120
+ ";": [
121
+ ","
122
+ ],
123
+ ":": [
124
+ ","
125
+ ],
126
+ "!": [
127
+ "."
128
+ ],
129
+ "?": [
130
+ "."
131
+ ]
132
+ },
133
+ "auto_bos_eos": true,
134
+ "minor_break": ",",
135
+ "major_break": ".",
136
+ "break_phonemes_into_graphemes": true,
137
+ "break_phonemes_into_codepoints": false,
138
+ "drop_stress": false,
139
+ "symbols": null
140
+ },
141
+ "text_aligner": {
142
+ "aligner": null,
143
+ "casing": null
144
+ },
145
+ "text_language": "af",
146
+ "phonemizer": "espeak",
147
+ "datasets": [
148
+ {
149
+ "name": "af-za_google",
150
+ "metadata_format": "text",
151
+ "multispeaker": true,
152
+ "text_language": null,
153
+ "audio_dir": "/media/12tb/af-za/google/af_za/za/afr/wavs",
154
+ "cache_dir": "/media/cache/af-za_google"
155
+ }
156
+ ],
157
+ "inference": {
158
+ "length_scale": 1.0,
159
+ "noise_scale": 0.333,
160
+ "noise_w": 0.333,
161
+ "major_break_ms": 250,
162
+ "auto_append_text": "."
163
+ },
164
+ "version": 1,
165
+ "git_commit": ""
166
+ }
voices/af_ZA/google-nwu_low/generator.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8366e03683ea3c15f25ba0163aa1b6e87c1692674cf27b1fb6ba479dbef7e0bb
3
+ size 76351329
voices/af_ZA/google-nwu_low/phonemes.txt ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0 _
2
+ 1 ^
3
+ 2 $
4
+ 3 ,
5
+ 4 .
6
+ 5 #
7
+ 6 ˈ
8
+ 7 ˌ
9
+ 8 -
10
+ 9 a
11
+ 10 b
12
+ 11 c
13
+ 12 d
14
+ 13 e
15
+ 14 f
16
+ 15 h
17
+ 16 i
18
+ 17 j
19
+ 18 k
20
+ 19 l
21
+ 20 m
22
+ 21 n
23
+ 22 o
24
+ 23 p
25
+ 24 r
26
+ 25 s
27
+ 26 t
28
+ 27 u
29
+ 28 v
30
+ 29 w
31
+ 30 x
32
+ 31 y
33
+ 32 z
34
+ 33 æ
35
+ 34 ð
36
+ 35 õ
37
+ 36 ø
38
+ 37 ŋ
39
+ 38 œ
40
+ 39 ɐ
41
+ 40 ɑ
42
+ 41 ɑ̃
43
+ 42 ɒ
44
+ 43 ɔ
45
+ 44 ə
46
+ 45 ɛ
47
+ 46 ɜ
48
+ 47 ɡ
49
+ 48 ɪ
50
+ 49 ɬ
51
+ 50 ɹ
52
+ 51 ʁ
53
+ 52 ʃ
54
+ 53 ʊ
55
+ 54 ʌ
56
+ 55 ʒ
57
+ 56 ʔ
58
+ 57 ː
59
+ 58 θ
60
+ 59 ẽ
voices/af_ZA/google-nwu_low/speaker_map.csv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ 0|af-za_google|7214
2
+ 1|af-za_google|8963
3
+ 2|af-za_google|7130
4
+ 3|af-za_google|8924
5
+ 4|af-za_google|8148
6
+ 5|af-za_google|1919
7
+ 6|af-za_google|2418
8
+ 7|af-za_google|6590
9
+ 8|af-za_google|0184
voices/af_ZA/google-nwu_low/speakers.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ 7214
2
+ 8963
3
+ 7130
4
+ 8924
5
+ 8148
6
+ 1919
7
+ 2418
8
+ 6590
9
+ 0184
voices/bn/multi_low/ALIASES ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ bn
2
+ bn/multi
voices/bn/multi_low/README.md ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bengali multi (Low Quality)
2
+
3
+ A multi-speaker model for Bengali based on:
4
+
5
+ * [cmu-indic](http://festvox.org/cmu_indic/)
6
+ * [google](http://www.openslr.org/37/)
7
+
8
+ See LICENSE files for licenses.
9
+
10
+
11
+ ## Phonemes
12
+
13
+ <table><thead><tr><th>&nbsp;</th><th>Phoneme</th><th>Description</th></tr></thead>
14
+ <tr>
15
+ <td> 0 </td>
16
+ <td> _ </td>
17
+ <td> padding </td>
18
+ </tr>
19
+ <tr>
20
+ <td> 1 </td>
21
+ <td> ^ </td>
22
+ <td> start utterance </td>
23
+ </tr>
24
+ <tr>
25
+ <td> 2 </td>
26
+ <td> $ </td>
27
+ <td> end utterance </td>
28
+ </tr>
29
+ <tr>
30
+ <td> 3 </td>
31
+ <td> , </td>
32
+ <td> short pause (minor break) </td>
33
+ </tr>
34
+ <tr>
35
+ <td> 4 </td>
36
+ <td> . </td>
37
+ <td> long pause (major break) </td>
38
+ </tr>
39
+ <tr>
40
+ <td> 5 </td>
41
+ <td> # </td>
42
+ <td> word break </td>
43
+ </tr>
44
+ <tr>
45
+ <td> 6 </td>
46
+ <td> ˈ </td>
47
+ <td> primary stress </td>
48
+ </tr>
49
+ <tr>
50
+ <td> 7 </td>
51
+ <td> ˌ </td>
52
+ <td> secondary stress </td>
53
+ </tr>
54
+ <tr>
55
+ <td> 8 </td>
56
+ <td> a </td>
57
+ <td> vowel open front unrounded [<a title="Audio sample for vowel open front unrounded " href="../../../phonemes/open_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
58
+ </tr>
59
+ <tr>
60
+ <td> 9 </td>
61
+ <td> b </td>
62
+ <td> consonant plosive bilabial voiced [<a title="Audio sample for consonant plosive bilabial voiced " href="../../../phonemes/voiced_bilabial_plosive.wav?raw=true">Sample</a>] </td>
63
+ </tr>
64
+ <tr>
65
+ <td> 10 </td>
66
+ <td> c </td>
67
+ <td> consonant plosive palatal unvoiced [<a title="Audio sample for consonant plosive palatal unvoiced " href="../../../phonemes/voiceless_palatal_plosive.wav?raw=true">Sample</a>] </td>
68
+ </tr>
69
+ <tr>
70
+ <td> 11 </td>
71
+ <td> d </td>
72
+ <td> consonant plosive alveolar voiced [<a title="Audio sample for consonant plosive alveolar voiced " href="../../../phonemes/voiced_alveolar_plosive.wav?raw=true">Sample</a>] </td>
73
+ </tr>
74
+ <tr>
75
+ <td> 12 </td>
76
+ <td> e </td>
77
+ <td> vowel close-mid front unrounded [<a title="Audio sample for vowel close-mid front unrounded " href="../../../phonemes/close-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
78
+ </tr>
79
+ <tr>
80
+ <td> 13 </td>
81
+ <td> f </td>
82
+ <td> consonant fricative labio-dental unvoiced [<a title="Audio sample for consonant fricative labio-dental unvoiced " href="../../../phonemes/voiceless_labiodental_fricative.wav?raw=true">Sample</a>] </td>
83
+ </tr>
84
+ <tr>
85
+ <td> 14 </td>
86
+ <td> h </td>
87
+ <td> consonant fricative glottal unvoiced [<a title="Audio sample for consonant fricative glottal unvoiced " href="../../../phonemes/voiceless_glottal_fricative.wav?raw=true">Sample</a>] </td>
88
+ </tr>
89
+ <tr>
90
+ <td> 15 </td>
91
+ <td> i </td>
92
+ <td> vowel close front unrounded [<a title="Audio sample for vowel close front unrounded " href="../../../phonemes/close_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
93
+ </tr>
94
+ <tr>
95
+ <td> 16 </td>
96
+ <td> j </td>
97
+ <td> consonant approximant palatal voiced [<a title="Audio sample for consonant approximant palatal voiced " href="../../../phonemes/palatal_approximant.wav?raw=true">Sample</a>] </td>
98
+ </tr>
99
+ <tr>
100
+ <td> 17 </td>
101
+ <td> k </td>
102
+ <td> consonant plosive velar unvoiced [<a title="Audio sample for consonant plosive velar unvoiced " href="../../../phonemes/voiceless_velar_plosive.wav?raw=true">Sample</a>] </td>
103
+ </tr>
104
+ <tr>
105
+ <td> 18 </td>
106
+ <td> l </td>
107
+ <td> consonant lateral-approximant alveolar voiced [<a title="Audio sample for consonant lateral-approximant alveolar voiced " href="../../../phonemes/alveolar_lateral_approximant.wav?raw=true">Sample</a>] </td>
108
+ </tr>
109
+ <tr>
110
+ <td> 19 </td>
111
+ <td> m </td>
112
+ <td> consonant nasal bilabial voiced [<a title="Audio sample for consonant nasal bilabial voiced " href="../../../phonemes/bilabial_nasal.wav?raw=true">Sample</a>] </td>
113
+ </tr>
114
+ <tr>
115
+ <td> 20 </td>
116
+ <td> n </td>
117
+ <td> consonant nasal alveolar voiced [<a title="Audio sample for consonant nasal alveolar voiced " href="../../../phonemes/alveolar_nasal.wav?raw=true">Sample</a>] </td>
118
+ </tr>
119
+ <tr>
120
+ <td> 21 </td>
121
+ <td> o </td>
122
+ <td> vowel close-mid back rounded [<a title="Audio sample for vowel close-mid back rounded " href="../../../phonemes/close-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
123
+ </tr>
124
+ <tr>
125
+ <td> 22 </td>
126
+ <td> p </td>
127
+ <td> consonant plosive bilabial unvoiced [<a title="Audio sample for consonant plosive bilabial unvoiced " href="../../../phonemes/voiceless_bilabial_plosive.wav?raw=true">Sample</a>] </td>
128
+ </tr>
129
+ <tr>
130
+ <td> 23 </td>
131
+ <td> p̃ </td>
132
+ <td> consonant plosive bilabial unvoiced [<a title="Audio sample for consonant plosive bilabial unvoiced " href="../../../phonemes/voiceless_bilabial_plosive.wav?raw=true">Sample</a>] </td>
133
+ </tr>
134
+ <tr>
135
+ <td> 24 </td>
136
+ <td> r </td>
137
+ <td> consonant trill alveolar voiced [<a title="Audio sample for consonant trill alveolar voiced " href="../../../phonemes/alveolar_trill.wav?raw=true">Sample</a>] </td>
138
+ </tr>
139
+ <tr>
140
+ <td> 25 </td>
141
+ <td> s </td>
142
+ <td> consonant fricative alveolar unvoiced [<a title="Audio sample for consonant fricative alveolar unvoiced " href="../../../phonemes/voiceless_alveolar_fricative.wav?raw=true">Sample</a>] </td>
143
+ </tr>
144
+ <tr>
145
+ <td> 26 </td>
146
+ <td> t </td>
147
+ <td> consonant plosive alveolar unvoiced [<a title="Audio sample for consonant plosive alveolar unvoiced " href="../../../phonemes/voiceless_alveolar_plosive.wav?raw=true">Sample</a>] </td>
148
+ </tr>
149
+ <tr>
150
+ <td> 27 </td>
151
+ <td> u </td>
152
+ <td> vowel close back rounded [<a title="Audio sample for vowel close back rounded " href="../../../phonemes/close_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
153
+ </tr>
154
+ <tr>
155
+ <td> 28 </td>
156
+ <td> v </td>
157
+ <td> consonant fricative labio-dental voiced [<a title="Audio sample for consonant fricative labio-dental voiced " href="../../../phonemes/voiced_labiodental_fricative.wav?raw=true">Sample</a>] </td>
158
+ </tr>
159
+ <tr>
160
+ <td> 29 </td>
161
+ <td> w </td>
162
+ <td> consonant approximant bilabial voiced [<a title="Audio sample for consonant approximant bilabial voiced " href="../../../phonemes/voiced_bilabial_approximant.wav?raw=true">Sample</a>] </td>
163
+ </tr>
164
+ <tr>
165
+ <td> 30 </td>
166
+ <td> ã </td>
167
+ <td> vowel open front unrounded [<a title="Audio sample for vowel open front unrounded " href="../../../phonemes/open_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
168
+ </tr>
169
+ <tr>
170
+ <td> 31 </td>
171
+ <td> æ </td>
172
+ <td> vowel near-open front unrounded [<a title="Audio sample for vowel near-open front unrounded " href="../../../phonemes/near-open_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
173
+ </tr>
174
+ <tr>
175
+ <td> 32 </td>
176
+ <td> õ </td>
177
+ <td> vowel close-mid back rounded [<a title="Audio sample for vowel close-mid back rounded " href="../../../phonemes/close-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
178
+ </tr>
179
+ <tr>
180
+ <td> 33 </td>
181
+ <td> ĩ </td>
182
+ <td> vowel close front unrounded [<a title="Audio sample for vowel close front unrounded " href="../../../phonemes/close_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
183
+ </tr>
184
+ <tr>
185
+ <td> 34 </td>
186
+ <td> ŋ </td>
187
+ <td> consonant nasal velar voiced [<a title="Audio sample for consonant nasal velar voiced " href="../../../phonemes/velar_nasal.wav?raw=true">Sample</a>] </td>
188
+ </tr>
189
+ <tr>
190
+ <td> 35 </td>
191
+ <td> ũ </td>
192
+ <td> vowel close back rounded [<a title="Audio sample for vowel close back rounded " href="../../../phonemes/close_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
193
+ </tr>
194
+ <tr>
195
+ <td> 36 </td>
196
+ <td> ɐ </td>
197
+ <td> vowel near-open central unrounded [<a title="Audio sample for vowel near-open central unrounded " href="../../../phonemes/near-open_central_unrounded_vowel.wav?raw=true">Sample</a>] </td>
198
+ </tr>
199
+ <tr>
200
+ <td> 37 </td>
201
+ <td> ɑ </td>
202
+ <td> vowel open back unrounded [<a title="Audio sample for vowel open back unrounded " href="../../../phonemes/open_back_unrounded_vowel.wav?raw=true">Sample</a>] </td>
203
+ </tr>
204
+ <tr>
205
+ <td> 38 </td>
206
+ <td> ɒ </td>
207
+ <td> vowel open back rounded [<a title="Audio sample for vowel open back rounded " href="../../../phonemes/open_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
208
+ </tr>
209
+ <tr>
210
+ <td> 39 </td>
211
+ <td> ɔ </td>
212
+ <td> vowel open-mid back rounded [<a title="Audio sample for vowel open-mid back rounded " href="../../../phonemes/open-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
213
+ </tr>
214
+ <tr>
215
+ <td> 40 </td>
216
+ <td> ɔ̃ </td>
217
+ <td> vowel open-mid back rounded [<a title="Audio sample for vowel open-mid back rounded " href="../../../phonemes/open-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
218
+ </tr>
219
+ <tr>
220
+ <td> 41 </td>
221
+ <td> ɖ </td>
222
+ <td> consonant plosive retroflex voiced [<a title="Audio sample for consonant plosive retroflex voiced " href="../../../phonemes/voiced_retroflex_plosive.wav?raw=true">Sample</a>] </td>
223
+ </tr>
224
+ <tr>
225
+ <td> 42 </td>
226
+ <td> ə </td>
227
+ <td> vowel mid central unrounded </td>
228
+ </tr>
229
+ <tr>
230
+ <td> 43 </td>
231
+ <td> ɛ </td>
232
+ <td> vowel open-mid front unrounded [<a title="Audio sample for vowel open-mid front unrounded " href="../../../phonemes/open-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
233
+ </tr>
234
+ <tr>
235
+ <td> 44 </td>
236
+ <td> ɜ </td>
237
+ <td> vowel open-mid central unrounded [<a title="Audio sample for vowel open-mid central unrounded " href="../../../phonemes/open-mid_central_unrounded_vowel.wav?raw=true">Sample</a>] </td>
238
+ </tr>
239
+ <tr>
240
+ <td> 45 </td>
241
+ <td> ɟ </td>
242
+ <td> consonant plosive palatal voiced [<a title="Audio sample for consonant plosive palatal voiced " href="../../../phonemes/voiced_palatal_plosive.wav?raw=true">Sample</a>] </td>
243
+ </tr>
244
+ <tr>
245
+ <td> 46 </td>
246
+ <td> ɡ </td>
247
+ <td> consonant plosive velar voiced [<a title="Audio sample for consonant plosive velar voiced " href="../../../phonemes/voiced_velar_plosive.wav?raw=true">Sample</a>] </td>
248
+ </tr>
249
+ <tr>
250
+ <td> 47 </td>
251
+ <td> ɪ </td>
252
+ <td> vowel near-close near-front unrounded [<a title="Audio sample for vowel near-close near-front unrounded " href="../../../phonemes/near-close_near-front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
253
+ </tr>
254
+ <tr>
255
+ <td> 48 </td>
256
+ <td> ɹ </td>
257
+ <td> consonant approximant alveolar voiced [<a title="Audio sample for consonant approximant alveolar voiced " href="../../../phonemes/alveolar_approximant.wav?raw=true">Sample</a>] </td>
258
+ </tr>
259
+ <tr>
260
+ <td> 49 </td>
261
+ <td> ɾ </td>
262
+ <td> consonant flap alveolar voiced </td>
263
+ </tr>
264
+ <tr>
265
+ <td> 50 </td>
266
+ <td> ʃ </td>
267
+ <td> consonant fricative post-alveolar unvoiced [<a title="Audio sample for consonant fricative post-alveolar unvoiced " href="../../../phonemes/voiceless_postalveolar_fricative.wav?raw=true">Sample</a>] </td>
268
+ </tr>
269
+ <tr>
270
+ <td> 51 </td>
271
+ <td> ʈ </td>
272
+ <td> consonant plosive retroflex unvoiced [<a title="Audio sample for consonant plosive retroflex unvoiced " href="../../../phonemes/voiceless_retroflex_plosive.wav?raw=true">Sample</a>] </td>
273
+ </tr>
274
+ <tr>
275
+ <td> 52 </td>
276
+ <td> ʊ </td>
277
+ <td> vowel near-close near-back rounded [<a title="Audio sample for vowel near-close near-back rounded " href="../../../phonemes/near-close_near-back_rounded_vowel.wav?raw=true">Sample</a>] </td>
278
+ </tr>
279
+ <tr>
280
+ <td> 53 </td>
281
+ <td> ʒ </td>
282
+ <td> consonant fricative post-alveolar voiced [<a title="Audio sample for consonant fricative post-alveolar voiced " href="../../../phonemes/voiced_postalveolar_fricative.wav?raw=true">Sample</a>] </td>
283
+ </tr>
284
+ <tr>
285
+ <td> 54 </td>
286
+ <td> ʰ </td>
287
+ <td> aspiration </td>
288
+ </tr>
289
+ <tr>
290
+ <td> 55 </td>
291
+ <td> ː </td>
292
+ <td> elongation </td>
293
+ </tr>
294
+ <tr>
295
+ <td> 56 </td>
296
+ <td> ẽ </td>
297
+ <td> vowel close-mid front unrounded [<a title="Audio sample for vowel close-mid front unrounded " href="../../../phonemes/close-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
298
+ </tr>
299
+ </table>
voices/bn/multi_low/README.md.in ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Bengali multi (Low Quality)
2
+
3
+ A multi-speaker model for Bengali based on:
4
+
5
+ * [cmu-indic](http://festvox.org/cmu_indic/)
6
+ * [google](http://www.openslr.org/37/)
7
+
8
+ See LICENSE files for licenses.
voices/bn/multi_low/SOURCE ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ http://festvox.org/cmu_indic/
2
+ http://www.openslr.org/37/
voices/bn/multi_low/VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.1.0
voices/bn/multi_low/cmu-indic/LICENSE ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Carnegie Mellon University
2
+ Copyright (c) 2003
3
+ All Rights Reserved.
4
+
5
+ Permission to use, copy, modify, and license this software and its
6
+ documentation for any purpose, is hereby granted without fee,
7
+ subject to the following conditions:
8
+ 1. The code must retain the above copyright notice, this list of
9
+ conditions and the following disclaimer.
10
+ 2. Any modifications must be clearly marked as such.
11
+ 3. Original authors' names are not deleted.
12
+
13
+ THE AUTHORS OF THIS WORK DISCLAIM ALL WARRANTIES WITH REGARD TO
14
+ THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
15
+ AND FITNESS, IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
16
+ SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
17
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
18
+ AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
19
+ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
20
+ THIS SOFTWARE.
voices/bn/multi_low/cmu-indic/SOURCE ADDED
@@ -0,0 +1 @@
 
 
1
+ http://festvox.org/cmu_indic/
voices/bn/multi_low/config.json ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "seed": 1234,
3
+ "epochs": 10000,
4
+ "learning_rate": 0.0002,
5
+ "betas": [
6
+ 0.8,
7
+ 0.99
8
+ ],
9
+ "eps": 1e-09,
10
+ "batch_size": 32,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1.0,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0,
18
+ "grad_clip": null,
19
+ "min_seq_length": null,
20
+ "max_seq_length": 400,
21
+ "min_spec_length": null,
22
+ "max_spec_length": null,
23
+ "min_speaker_utterances": null,
24
+ "last_epoch": 1,
25
+ "global_step": 1,
26
+ "best_loss": null,
27
+ "audio": {
28
+ "filter_length": 1024,
29
+ "hop_length": 256,
30
+ "win_length": 1024,
31
+ "mel_channels": 80,
32
+ "sample_rate": 22050,
33
+ "sample_bytes": 2,
34
+ "channels": 1,
35
+ "mel_fmin": 0,
36
+ "mel_fmax": null,
37
+ "ref_level_db": 20,
38
+ "spec_gain": 1,
39
+ "signal_norm": true,
40
+ "min_level_db": -100,
41
+ "max_norm": 1,
42
+ "clip_norm": true,
43
+ "symmetric_norm": true,
44
+ "do_dynamic_range_compression": true,
45
+ "convert_db_to_amp": true,
46
+ "do_trim_silence": false,
47
+ "trim_silence_db": 40,
48
+ "trim_margin_sec": 0.01,
49
+ "trim_keep_sec": 0.25,
50
+ "scale_mels": false
51
+ },
52
+ "model": {
53
+ "num_symbols": 57,
54
+ "n_speakers": 16,
55
+ "inter_channels": 192,
56
+ "hidden_channels": 192,
57
+ "filter_channels": 768,
58
+ "n_heads": 2,
59
+ "n_layers": 6,
60
+ "kernel_size": 3,
61
+ "p_dropout": 0.1,
62
+ "resblock": "2",
63
+ "resblock_kernel_sizes": [
64
+ 3,
65
+ 5,
66
+ 7
67
+ ],
68
+ "resblock_dilation_sizes": [
69
+ [
70
+ 1,
71
+ 2
72
+ ],
73
+ [
74
+ 2,
75
+ 6
76
+ ],
77
+ [
78
+ 3,
79
+ 12
80
+ ]
81
+ ],
82
+ "upsample_rates": [
83
+ 8,
84
+ 8,
85
+ 4
86
+ ],
87
+ "upsample_initial_channel": 256,
88
+ "upsample_kernel_sizes": [
89
+ 16,
90
+ 16,
91
+ 8
92
+ ],
93
+ "n_layers_q": 3,
94
+ "use_spectral_norm": false,
95
+ "gin_channels": 512,
96
+ "use_sdp": true
97
+ },
98
+ "phonemes": {
99
+ "phoneme_separator": "_",
100
+ "word_separator": "#",
101
+ "phoneme_to_id": null,
102
+ "pad": "_",
103
+ "bos": "^",
104
+ "eos": "$",
105
+ "blank": "_",
106
+ "blank_word": "#",
107
+ "blank_between": "tokens_and_words",
108
+ "blank_at_start": true,
109
+ "blank_at_end": true,
110
+ "simple_punctuation": true,
111
+ "punctuation_map": null,
112
+ "separate": [
113
+ "\u02c8",
114
+ "\u02cc"
115
+ ],
116
+ "separate_graphemes": false,
117
+ "separate_tones": false,
118
+ "tone_before": false,
119
+ "phoneme_map": null,
120
+ "auto_bos_eos": true,
121
+ "minor_break": ",",
122
+ "major_break": ".",
123
+ "break_phonemes_into_graphemes": true,
124
+ "break_phonemes_into_codepoints": false,
125
+ "drop_stress": false,
126
+ "symbols": null
127
+ },
128
+ "text_aligner": {
129
+ "aligner": null,
130
+ "casing": null
131
+ },
132
+ "text_language": "bn",
133
+ "phonemizer": "espeak",
134
+ "datasets": [
135
+ {
136
+ "name": "bn-multi",
137
+ "metadata_format": "text",
138
+ "multispeaker": true,
139
+ "text_language": null,
140
+ "audio_dir": "/media/12tb/bn/google/wavs",
141
+ "cache_dir": "/media/cache/bn-multi"
142
+ }
143
+ ],
144
+ "inference": {
145
+ "length_scale": 1.0,
146
+ "noise_scale": 0.333,
147
+ "noise_w": 0.333,
148
+ "minor_break_ms": 100,
149
+ "major_break_ms": 250,
150
+ "auto_append_text": "."
151
+ },
152
+ "version": 1,
153
+ "git_commit": ""
154
+ }
voices/bn/multi_low/generator.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:115ebb90476abbd2d2828db90b549792e932340d731df885699f2ebdca697ba3
3
+ size 76363361
voices/bn/multi_low/google/LICENSE ADDED
@@ -0,0 +1 @@
 
 
1
+ Attribution-ShareAlike 4.0 (CC BY-SA 4.0)
voices/bn/multi_low/google/SOURCE ADDED
@@ -0,0 +1 @@
 
 
1
+ http://www.openslr.org/37/
voices/bn/multi_low/phoneme_map.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ । .
voices/bn/multi_low/phonemes.txt ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0 _
2
+ 1 ^
3
+ 2 $
4
+ 3 ,
5
+ 4 .
6
+ 5 #
7
+ 6 ˈ
8
+ 7 ˌ
9
+ 8 a
10
+ 9 b
11
+ 10 c
12
+ 11 d
13
+ 12 e
14
+ 13 f
15
+ 14 h
16
+ 15 i
17
+ 16 j
18
+ 17 k
19
+ 18 l
20
+ 19 m
21
+ 20 n
22
+ 21 o
23
+ 22 p
24
+ 23 p̃
25
+ 24 r
26
+ 25 s
27
+ 26 t
28
+ 27 u
29
+ 28 v
30
+ 29 w
31
+ 30 ã
32
+ 31 æ
33
+ 32 õ
34
+ 33 ĩ
35
+ 34 ŋ
36
+ 35 ũ
37
+ 36 ɐ
38
+ 37 ɑ
39
+ 38 ɒ
40
+ 39 ɔ
41
+ 40 ɔ̃
42
+ 41 ɖ
43
+ 42 ə
44
+ 43 ɛ
45
+ 44 ɜ
46
+ 45 ɟ
47
+ 46 ɡ
48
+ 47 ɪ
49
+ 48 ɹ
50
+ 49 ɾ
51
+ 50 ʃ
52
+ 51 ʈ
53
+ 52 ʊ
54
+ 53 ʒ
55
+ 54 ʰ
56
+ 55 ː
57
+ 56 ẽ
voices/bn/multi_low/speaker_map.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0|bn-multi|rm
2
+ 1|bn-multi|03042
3
+ 2|bn-multi|00737
4
+ 3|bn-multi|01232
5
+ 4|bn-multi|02194
6
+ 5|bn-multi|3108
7
+ 6|bn-multi|3713
8
+ 7|bn-multi|1010
9
+ 8|bn-multi|00779
10
+ 9|bn-multi|9169
11
+ 10|bn-multi|4046
12
+ 11|bn-multi|5958
13
+ 12|bn-multi|01701
14
+ 13|bn-multi|4811
15
+ 14|bn-multi|0834
16
+ 15|bn-multi|3958
voices/bn/multi_low/speakers.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ rm
2
+ 03042
3
+ 00737
4
+ 01232
5
+ 02194
6
+ 3108
7
+ 3713
8
+ 1010
9
+ 00779
10
+ 9169
11
+ 4046
12
+ 5958
13
+ 01701
14
+ 4811
15
+ 0834
16
+ 3958
voices/de_DE/m-ailabs_low/ALIASES ADDED
@@ -0,0 +1 @@
 
 
1
+ de_DE/m-ailabs
voices/de_DE/m-ailabs_low/LICENSE ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2017-2019 by the original creators @ M-AILABS with the following license:
2
+
3
+ Redistribution and use in any form, including any commercial use, with or without modification are permitted provided that the following conditions are met:
4
+
5
+ Redistributions of source data must retain the above copyright notice, this list of conditions and the following disclaimer.
6
+ Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this downloaded data, source-code or binary-code without specific prior written permission.
7
+
8
+ THIS DATA IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE and/or DATA, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
voices/de_DE/m-ailabs_low/README.md ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # German M-AILabs (Low Quality)
2
+
3
+ A multi-speaker model for German based on the [M-AILabs dataset](https://www.caito.de/2019/01/03/the-m-ailabs-speech-dataset/).
4
+
5
+ See LICENSE file for license.
6
+
7
+
8
+ ## Phonemes
9
+
10
+ <table><thead><th>&nbsp;</th><th>Phoneme</th><th>Description</th></thead>
11
+ <tr>
12
+ <td> 0 </td>
13
+ <td> _ </td>
14
+ <td> padding </td>
15
+ </tr>
16
+ <tr>
17
+ <td> 1 </td>
18
+ <td> ^ </td>
19
+ <td> start utterance </td>
20
+ </tr>
21
+ <tr>
22
+ <td> 2 </td>
23
+ <td> $ </td>
24
+ <td> end utterance </td>
25
+ </tr>
26
+ <tr>
27
+ <td> 3 </td>
28
+ <td> · </td>
29
+ <td> silence </td>
30
+ </tr>
31
+ <tr>
32
+ <td> 4 </td>
33
+ <td> # </td>
34
+ <td> word break </td>
35
+ </tr>
36
+ <tr>
37
+ <td> 5 </td>
38
+ <td> ˈ </td>
39
+ <td> primary stress </td>
40
+ </tr>
41
+ <tr>
42
+ <td> 6 </td>
43
+ <td> ˌ </td>
44
+ <td> secondary stress </td>
45
+ </tr>
46
+ <tr>
47
+ <td> 7 </td>
48
+ <td> a </td>
49
+ <td> vowel open front unrounded [<a title="Audio sample for vowel open front unrounded " href="../../../phonemes/open_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
50
+ </tr>
51
+ <tr>
52
+ <td> 8 </td>
53
+ <td> aɪ̯ </td>
54
+ <td> diphthong </td>
55
+ </tr>
56
+ <tr>
57
+ <td> 9 </td>
58
+ <td> aʊ̯ </td>
59
+ <td> diphthong </td>
60
+ </tr>
61
+ <tr>
62
+ <td> 10 </td>
63
+ <td> aː </td>
64
+ <td> vowel open front unrounded [<a title="Audio sample for vowel open front unrounded " href="../../../phonemes/open_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
65
+ </tr>
66
+ <tr>
67
+ <td> 11 </td>
68
+ <td> b </td>
69
+ <td> consonant plosive bilabial voiced [<a title="Audio sample for consonant plosive bilabial voiced " href="../../../phonemes/voiced_bilabial_plosive.wav?raw=true">Sample</a>] </td>
70
+ </tr>
71
+ <tr>
72
+ <td> 12 </td>
73
+ <td> d </td>
74
+ <td> consonant plosive alveolar voiced [<a title="Audio sample for consonant plosive alveolar voiced " href="../../../phonemes/voiced_alveolar_plosive.wav?raw=true">Sample</a>] </td>
75
+ </tr>
76
+ <tr>
77
+ <td> 13 </td>
78
+ <td> d͡ʒ </td>
79
+ <td> consonant affricate post-alveolar voiced [<a title="Audio sample for consonant affricate post-alveolar voiced " href="../../../phonemes/voiced_postalveolar_affricate.wav?raw=true">Sample</a>] </td>
80
+ </tr>
81
+ <tr>
82
+ <td> 14 </td>
83
+ <td> eː </td>
84
+ <td> vowel close-mid front unrounded [<a title="Audio sample for vowel close-mid front unrounded " href="../../../phonemes/close-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
85
+ </tr>
86
+ <tr>
87
+ <td> 15 </td>
88
+ <td> f </td>
89
+ <td> consonant fricative labio-dental unvoiced [<a title="Audio sample for consonant fricative labio-dental unvoiced " href="../../../phonemes/voiceless_labiodental_fricative.wav?raw=true">Sample</a>] </td>
90
+ </tr>
91
+ <tr>
92
+ <td> 16 </td>
93
+ <td> g </td>
94
+ <td> consonant plosive velar voiced [<a title="Audio sample for consonant plosive velar voiced " href="../../../phonemes/voiced_velar_plosive.wav?raw=true">Sample</a>] </td>
95
+ </tr>
96
+ <tr>
97
+ <td> 17 </td>
98
+ <td> h </td>
99
+ <td> consonant fricative glottal unvoiced [<a title="Audio sample for consonant fricative glottal unvoiced " href="../../../phonemes/voiceless_glottal_fricative.wav?raw=true">Sample</a>] </td>
100
+ </tr>
101
+ <tr>
102
+ <td> 18 </td>
103
+ <td> iː </td>
104
+ <td> vowel close front unrounded [<a title="Audio sample for vowel close front unrounded " href="../../../phonemes/close_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
105
+ </tr>
106
+ <tr>
107
+ <td> 19 </td>
108
+ <td> j </td>
109
+ <td> consonant approximant palatal voiced [<a title="Audio sample for consonant approximant palatal voiced " href="../../../phonemes/palatal_approximant.wav?raw=true">Sample</a>] </td>
110
+ </tr>
111
+ <tr>
112
+ <td> 20 </td>
113
+ <td> k </td>
114
+ <td> consonant plosive velar unvoiced [<a title="Audio sample for consonant plosive velar unvoiced " href="../../../phonemes/voiceless_velar_plosive.wav?raw=true">Sample</a>] </td>
115
+ </tr>
116
+ <tr>
117
+ <td> 21 </td>
118
+ <td> l </td>
119
+ <td> consonant lateral-approximant alveolar voiced [<a title="Audio sample for consonant lateral-approximant alveolar voiced " href="../../../phonemes/alveolar_lateral_approximant.wav?raw=true">Sample</a>] </td>
120
+ </tr>
121
+ <tr>
122
+ <td> 22 </td>
123
+ <td> m </td>
124
+ <td> consonant nasal bilabial voiced [<a title="Audio sample for consonant nasal bilabial voiced " href="../../../phonemes/bilabial_nasal.wav?raw=true">Sample</a>] </td>
125
+ </tr>
126
+ <tr>
127
+ <td> 23 </td>
128
+ <td> n </td>
129
+ <td> consonant nasal alveolar voiced [<a title="Audio sample for consonant nasal alveolar voiced " href="../../../phonemes/alveolar_nasal.wav?raw=true">Sample</a>] </td>
130
+ </tr>
131
+ <tr>
132
+ <td> 24 </td>
133
+ <td> oː </td>
134
+ <td> vowel close-mid back rounded [<a title="Audio sample for vowel close-mid back rounded " href="../../../phonemes/close-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
135
+ </tr>
136
+ <tr>
137
+ <td> 25 </td>
138
+ <td> p </td>
139
+ <td> consonant plosive bilabial unvoiced [<a title="Audio sample for consonant plosive bilabial unvoiced " href="../../../phonemes/voiceless_bilabial_plosive.wav?raw=true">Sample</a>] </td>
140
+ </tr>
141
+ <tr>
142
+ <td> 26 </td>
143
+ <td> p͡f </td>
144
+ <td> consonant affricate labio-dental unvoiced </td>
145
+ </tr>
146
+ <tr>
147
+ <td> 27 </td>
148
+ <td> s </td>
149
+ <td> consonant fricative alveolar unvoiced [<a title="Audio sample for consonant fricative alveolar unvoiced " href="../../../phonemes/voiceless_alveolar_fricative.wav?raw=true">Sample</a>] </td>
150
+ </tr>
151
+ <tr>
152
+ <td> 28 </td>
153
+ <td> t </td>
154
+ <td> consonant plosive alveolar unvoiced [<a title="Audio sample for consonant plosive alveolar unvoiced " href="../../../phonemes/voiceless_alveolar_plosive.wav?raw=true">Sample</a>] </td>
155
+ </tr>
156
+ <tr>
157
+ <td> 29 </td>
158
+ <td> t͡s </td>
159
+ <td> consonant affricate alveolar unvoiced [<a title="Audio sample for consonant affricate alveolar unvoiced " href="../../../phonemes/voiceless_alveolar_affricate.wav?raw=true">Sample</a>] </td>
160
+ </tr>
161
+ <tr>
162
+ <td> 30 </td>
163
+ <td> t͡ʃ </td>
164
+ <td> consonant affricate post-alveolar unvoiced [<a title="Audio sample for consonant affricate post-alveolar unvoiced " href="../../../phonemes/voiceless_postalveolar_affricate.wav?raw=true">Sample</a>] </td>
165
+ </tr>
166
+ <tr>
167
+ <td> 31 </td>
168
+ <td> uː </td>
169
+ <td> vowel close back rounded [<a title="Audio sample for vowel close back rounded " href="../../../phonemes/close_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
170
+ </tr>
171
+ <tr>
172
+ <td> 32 </td>
173
+ <td> v </td>
174
+ <td> consonant fricative labio-dental voiced [<a title="Audio sample for consonant fricative labio-dental voiced " href="../../../phonemes/voiced_labiodental_fricative.wav?raw=true">Sample</a>] </td>
175
+ </tr>
176
+ <tr>
177
+ <td> 33 </td>
178
+ <td> x </td>
179
+ <td> consonant fricative velar unvoiced [<a title="Audio sample for consonant fricative velar unvoiced " href="../../../phonemes/voiceless_velar_fricative.wav?raw=true">Sample</a>] </td>
180
+ </tr>
181
+ <tr>
182
+ <td> 34 </td>
183
+ <td> yː </td>
184
+ <td> vowel close front rounded [<a title="Audio sample for vowel close front rounded " href="../../../phonemes/close_front_rounded_vowel.wav?raw=true">Sample</a>] </td>
185
+ </tr>
186
+ <tr>
187
+ <td> 35 </td>
188
+ <td> z </td>
189
+ <td> consonant fricative alveolar voiced [<a title="Audio sample for consonant fricative alveolar voiced " href="../../../phonemes/voiced_alveolar_fricative.wav?raw=true">Sample</a>] </td>
190
+ </tr>
191
+ <tr>
192
+ <td> 36 </td>
193
+ <td> ãː </td>
194
+ <td> vowel open front unrounded [<a title="Audio sample for vowel open front unrounded " href="../../../phonemes/open_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
195
+ </tr>
196
+ <tr>
197
+ <td> 37 </td>
198
+ <td> ç </td>
199
+ <td> consonant fricative palatal unvoiced [<a title="Audio sample for consonant fricative palatal unvoiced " href="../../../phonemes/voiceless_palatal_fricative.wav?raw=true">Sample</a>] </td>
200
+ </tr>
201
+ <tr>
202
+ <td> 38 </td>
203
+ <td> õː </td>
204
+ <td> vowel close-mid back rounded [<a title="Audio sample for vowel close-mid back rounded " href="../../../phonemes/close-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
205
+ </tr>
206
+ <tr>
207
+ <td> 39 </td>
208
+ <td> øː </td>
209
+ <td> vowel close-mid front rounded [<a title="Audio sample for vowel close-mid front rounded " href="../../../phonemes/close-mid_front_rounded_vowel.wav?raw=true">Sample</a>] </td>
210
+ </tr>
211
+ <tr>
212
+ <td> 40 </td>
213
+ <td> ŋ </td>
214
+ <td> consonant nasal velar voiced [<a title="Audio sample for consonant nasal velar voiced " href="../../../phonemes/velar_nasal.wav?raw=true">Sample</a>] </td>
215
+ </tr>
216
+ <tr>
217
+ <td> 41 </td>
218
+ <td> œ </td>
219
+ <td> vowel open-mid front rounded [<a title="Audio sample for vowel open-mid front rounded " href="../../../phonemes/open-mid_front_rounded_vowel.wav?raw=true">Sample</a>] </td>
220
+ </tr>
221
+ <tr>
222
+ <td> 42 </td>
223
+ <td> ɐ </td>
224
+ <td> vowel near-open central unrounded [<a title="Audio sample for vowel near-open central unrounded " href="../../../phonemes/near-open_central_unrounded_vowel.wav?raw=true">Sample</a>] </td>
225
+ </tr>
226
+ <tr>
227
+ <td> 43 </td>
228
+ <td> ɔ </td>
229
+ <td> vowel open-mid back rounded [<a title="Audio sample for vowel open-mid back rounded " href="../../../phonemes/open-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
230
+ </tr>
231
+ <tr>
232
+ <td> 44 </td>
233
+ <td> ɔʏ̯ </td>
234
+ <td> diphthong </td>
235
+ </tr>
236
+ <tr>
237
+ <td> 45 </td>
238
+ <td> ə </td>
239
+ <td> vowel mid central unrounded </td>
240
+ </tr>
241
+ <tr>
242
+ <td> 46 </td>
243
+ <td> ɛ </td>
244
+ <td> vowel open-mid front unrounded [<a title="Audio sample for vowel open-mid front unrounded " href="../../../phonemes/open-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
245
+ </tr>
246
+ <tr>
247
+ <td> 47 </td>
248
+ <td> ɛː </td>
249
+ <td> vowel open-mid front unrounded [<a title="Audio sample for vowel open-mid front unrounded " href="../../../phonemes/open-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
250
+ </tr>
251
+ <tr>
252
+ <td> 48 </td>
253
+ <td> ɛ̃ː </td>
254
+ <td> vowel open-mid front unrounded [<a title="Audio sample for vowel open-mid front unrounded " href="../../../phonemes/open-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
255
+ </tr>
256
+ <tr>
257
+ <td> 49 </td>
258
+ <td> ɪ </td>
259
+ <td> vowel near-close near-front unrounded [<a title="Audio sample for vowel near-close near-front unrounded " href="../../../phonemes/near-close_near-front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
260
+ </tr>
261
+ <tr>
262
+ <td> 50 </td>
263
+ <td> ʁ </td>
264
+ <td> consonant fricative uvular voiced [<a title="Audio sample for consonant fricative uvular voiced " href="../../../phonemes/voiced_uvular_fricative.wav?raw=true">Sample</a>] </td>
265
+ </tr>
266
+ <tr>
267
+ <td> 51 </td>
268
+ <td> ʃ </td>
269
+ <td> consonant fricative post-alveolar unvoiced [<a title="Audio sample for consonant fricative post-alveolar unvoiced " href="../../../phonemes/voiceless_postalveolar_fricative.wav?raw=true">Sample</a>] </td>
270
+ </tr>
271
+ <tr>
272
+ <td> 52 </td>
273
+ <td> ʊ </td>
274
+ <td> vowel near-close near-back rounded [<a title="Audio sample for vowel near-close near-back rounded " href="../../../phonemes/near-close_near-back_rounded_vowel.wav?raw=true">Sample</a>] </td>
275
+ </tr>
276
+ <tr>
277
+ <td> 53 </td>
278
+ <td> ʏ </td>
279
+ <td> vowel near-close near-front rounded [<a title="Audio sample for vowel near-close near-front rounded " href="../../../phonemes/near-close_near-front_rounded_vowel.wav?raw=true">Sample</a>] </td>
280
+ </tr>
281
+ <tr>
282
+ <td> 54 </td>
283
+ <td> ʒ </td>
284
+ <td> consonant fricative post-alveolar voiced [<a title="Audio sample for consonant fricative post-alveolar voiced " href="../../../phonemes/voiced_postalveolar_fricative.wav?raw=true">Sample</a>] </td>
285
+ </tr>
286
+ <tr>
287
+ <td> 55 </td>
288
+ <td> ʔ </td>
289
+ <td> consonant plosive glottal unvoiced [<a title="Audio sample for consonant plosive glottal unvoiced " href="../../../phonemes/glottal_plosive.wav?raw=true">Sample</a>] </td>
290
+ </tr>
291
+ <tr>
292
+ <td> 56 </td>
293
+ <td> χ </td>
294
+ <td> consonant fricative uvular unvoiced [<a title="Audio sample for consonant fricative uvular unvoiced " href="../../../phonemes/voiceless_uvular_fricative.wav?raw=true">Sample</a>] </td>
295
+ </tr>
296
+ </table>
voices/de_DE/m-ailabs_low/README.md.in ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # German M-AILabs (Low Quality)
2
+
3
+ A multi-speaker model for German based on the [M-AILabs dataset](https://www.caito.de/2019/01/03/the-m-ailabs-speech-dataset/).
4
+
5
+ See LICENSE file for license.
voices/de_DE/m-ailabs_low/SOURCE ADDED
@@ -0,0 +1 @@
 
 
1
+ https://www.caito.de/2019/01/03/the-m-ailabs-speech-dataset/
voices/de_DE/m-ailabs_low/VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.1.0
voices/de_DE/m-ailabs_low/config.json ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "seed": 1234,
3
+ "epochs": 10000,
4
+ "learning_rate": 0.0002,
5
+ "betas": [
6
+ 0.8,
7
+ 0.99
8
+ ],
9
+ "eps": 1e-09,
10
+ "batch_size": 32,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1.0,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0,
18
+ "grad_clip": null,
19
+ "min_seq_length": null,
20
+ "max_seq_length": 400,
21
+ "min_spec_length": null,
22
+ "max_spec_length": null,
23
+ "min_speaker_utterances": null,
24
+ "last_epoch": 1,
25
+ "global_step": 1,
26
+ "best_loss": null,
27
+ "audio": {
28
+ "filter_length": 1024,
29
+ "hop_length": 256,
30
+ "win_length": 1024,
31
+ "mel_channels": 80,
32
+ "sample_rate": 22050,
33
+ "sample_bytes": 2,
34
+ "channels": 1,
35
+ "mel_fmin": 0,
36
+ "mel_fmax": null,
37
+ "ref_level_db": 20,
38
+ "spec_gain": 1,
39
+ "signal_norm": true,
40
+ "min_level_db": -100,
41
+ "max_norm": 1,
42
+ "clip_norm": true,
43
+ "symmetric_norm": true,
44
+ "do_dynamic_range_compression": true,
45
+ "convert_db_to_amp": true,
46
+ "do_trim_silence": false,
47
+ "trim_silence_db": 40,
48
+ "trim_margin_sec": 0.01,
49
+ "trim_keep_sec": 0.25,
50
+ "scale_mels": false
51
+ },
52
+ "model": {
53
+ "num_symbols": 57,
54
+ "n_speakers": 5,
55
+ "inter_channels": 192,
56
+ "hidden_channels": 192,
57
+ "filter_channels": 768,
58
+ "n_heads": 2,
59
+ "n_layers": 6,
60
+ "kernel_size": 3,
61
+ "p_dropout": 0.1,
62
+ "resblock": "2",
63
+ "resblock_kernel_sizes": [
64
+ 3,
65
+ 5,
66
+ 7
67
+ ],
68
+ "resblock_dilation_sizes": [
69
+ [
70
+ 1,
71
+ 2
72
+ ],
73
+ [
74
+ 2,
75
+ 6
76
+ ],
77
+ [
78
+ 3,
79
+ 12
80
+ ]
81
+ ],
82
+ "upsample_rates": [
83
+ 8,
84
+ 8,
85
+ 4
86
+ ],
87
+ "upsample_initial_channel": 256,
88
+ "upsample_kernel_sizes": [
89
+ 16,
90
+ 16,
91
+ 8
92
+ ],
93
+ "n_layers_q": 3,
94
+ "use_spectral_norm": false,
95
+ "gin_channels": 512,
96
+ "use_sdp": true
97
+ },
98
+ "phonemes": {
99
+ "phoneme_separator": "_",
100
+ "word_separator": "#",
101
+ "phoneme_to_id": null,
102
+ "pad": "_",
103
+ "bos": "^",
104
+ "eos": "$",
105
+ "blank": "_",
106
+ "blank_word": "#",
107
+ "blank_between": "tokens_and_words",
108
+ "blank_at_start": true,
109
+ "blank_at_end": true,
110
+ "simple_punctuation": true,
111
+ "punctuation_map": null,
112
+ "separate": [
113
+ "\u02c8",
114
+ "\u02cc"
115
+ ],
116
+ "separate_graphemes": false,
117
+ "separate_tones": false,
118
+ "tone_before": false,
119
+ "phoneme_map": null,
120
+ "auto_bos_eos": true,
121
+ "minor_break": "\u00b7",
122
+ "major_break": null,
123
+ "break_phonemes_into_graphemes": false,
124
+ "drop_stress": false,
125
+ "symbols": null
126
+ },
127
+ "text_aligner": {
128
+ "aligner": "kaldi_align",
129
+ "casing": "lower"
130
+ },
131
+ "text_language": "de-de",
132
+ "phonemizer": "gruut",
133
+ "datasets": [
134
+ {
135
+ "name": "m-ailabs",
136
+ "metadata_format": "text",
137
+ "multispeaker": true,
138
+ "text_language": null,
139
+ "audio_dir": "/media/12tb/de-de/m-ai-labs/de_DE",
140
+ "cache_dir": "/media/cache/m-ailabs/de_DE"
141
+ }
142
+ ],
143
+ "inference": {
144
+ "length_scale": 1.0,
145
+ "noise_scale": 0.333,
146
+ "noise_w": 0.333,
147
+ "auto_append_text": "."
148
+ },
149
+ "version": 1,
150
+ "git_commit": ""
151
+ }
voices/de_DE/m-ailabs_low/generator.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3330372429b25fe3a38b10bbe914862a49b2cd0a58da332bbe30fa123035a067
3
+ size 76340831
voices/de_DE/m-ailabs_low/phoneme_map.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ | ·
2
+ ‖ · ·
voices/de_DE/m-ailabs_low/phonemes.txt ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0 _
2
+ 1 ^
3
+ 2 $
4
+ 3 ·
5
+ 4 #
6
+ 5 ˈ
7
+ 6 ˌ
8
+ 7 a
9
+ 8 aɪ̯
10
+ 9 aʊ̯
11
+ 10 aː
12
+ 11 b
13
+ 12 d
14
+ 13 d͡ʒ
15
+ 14 eː
16
+ 15 f
17
+ 16 g
18
+ 17 h
19
+ 18 iː
20
+ 19 j
21
+ 20 k
22
+ 21 l
23
+ 22 m
24
+ 23 n
25
+ 24 oː
26
+ 25 p
27
+ 26 p͡f
28
+ 27 s
29
+ 28 t
30
+ 29 t͡s
31
+ 30 t͡ʃ
32
+ 31 uː
33
+ 32 v
34
+ 33 x
35
+ 34 yː
36
+ 35 z
37
+ 36 ãː
38
+ 37 ç
39
+ 38 õː
40
+ 39 øː
41
+ 40 ŋ
42
+ 41 œ
43
+ 42 ɐ
44
+ 43 ɔ
45
+ 44 ɔʏ̯
46
+ 45 ə
47
+ 46 ɛ
48
+ 47 ɛː
49
+ 48 ɛ̃ː
50
+ 49 ɪ
51
+ 50 ʁ
52
+ 51 ʃ
53
+ 52 ʊ
54
+ 53 ʏ
55
+ 54 ʒ
56
+ 55 ʔ
57
+ 56 χ
voices/de_DE/m-ailabs_low/speaker_map.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 0|m-ailabs|ramona_deininger
2
+ 1|m-ailabs|karlsson
3
+ 2|m-ailabs|rebecca_braunert_plunkett
4
+ 3|m-ailabs|eva_k
5
+ 4|m-ailabs|angela_merkel
voices/de_DE/m-ailabs_low/speakers.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ramona_deininger
2
+ karlsson
3
+ rebecca_braunert_plunkett
4
+ eva_k
5
+ angela_merkel
voices/de_DE/thorsten-emotion_low/LICENSE ADDED
@@ -0,0 +1 @@
 
 
1
+ Creative Commons (CC0) Licence
voices/de_DE/thorsten-emotion_low/README.md ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # German Thorsten Emotion (Low Quality)
2
+
3
+ A single-speaker model for German based on the [Thorsten Emotional dataset](http://www.openslr.org/110/).
4
+
5
+ See LICENSE file for license.
6
+
7
+
8
+ ## Phonemes
9
+
10
+ <table><thead><tr><th>&nbsp;</th><th>Phoneme</th><th>Description</th></tr></thead>
11
+ <tr>
12
+ <td> 0 </td>
13
+ <td> _ </td>
14
+ <td> padding </td>
15
+ </tr>
16
+ <tr>
17
+ <td> 1 </td>
18
+ <td> ^ </td>
19
+ <td> start utterance </td>
20
+ </tr>
21
+ <tr>
22
+ <td> 2 </td>
23
+ <td> $ </td>
24
+ <td> end utterance </td>
25
+ </tr>
26
+ <tr>
27
+ <td> 3 </td>
28
+ <td> | </td>
29
+ <td> short pause (minor break) </td>
30
+ </tr>
31
+ <tr>
32
+ <td> 4 </td>
33
+ <td> # </td>
34
+ <td> word break </td>
35
+ </tr>
36
+ <tr>
37
+ <td> 5 </td>
38
+ <td> ˈ </td>
39
+ <td> primary stress </td>
40
+ </tr>
41
+ <tr>
42
+ <td> 6 </td>
43
+ <td> ˌ </td>
44
+ <td> secondary stress </td>
45
+ </tr>
46
+ <tr>
47
+ <td> 7 </td>
48
+ <td> a </td>
49
+ <td> vowel open front unrounded [<a title="Audio sample for vowel open front unrounded " href="../../../phonemes/open_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
50
+ </tr>
51
+ <tr>
52
+ <td> 8 </td>
53
+ <td> aɪ̯ </td>
54
+ <td> diphthong </td>
55
+ </tr>
56
+ <tr>
57
+ <td> 9 </td>
58
+ <td> aʊ̯ </td>
59
+ <td> diphthong </td>
60
+ </tr>
61
+ <tr>
62
+ <td> 10 </td>
63
+ <td> aː </td>
64
+ <td> vowel open front unrounded [<a title="Audio sample for vowel open front unrounded " href="../../../phonemes/open_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
65
+ </tr>
66
+ <tr>
67
+ <td> 11 </td>
68
+ <td> b </td>
69
+ <td> consonant plosive bilabial voiced [<a title="Audio sample for consonant plosive bilabial voiced " href="../../../phonemes/voiced_bilabial_plosive.wav?raw=true">Sample</a>] </td>
70
+ </tr>
71
+ <tr>
72
+ <td> 12 </td>
73
+ <td> d </td>
74
+ <td> consonant plosive alveolar voiced [<a title="Audio sample for consonant plosive alveolar voiced " href="../../../phonemes/voiced_alveolar_plosive.wav?raw=true">Sample</a>] </td>
75
+ </tr>
76
+ <tr>
77
+ <td> 13 </td>
78
+ <td> eː </td>
79
+ <td> vowel close-mid front unrounded [<a title="Audio sample for vowel close-mid front unrounded " href="../../../phonemes/close-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
80
+ </tr>
81
+ <tr>
82
+ <td> 14 </td>
83
+ <td> f </td>
84
+ <td> consonant fricative labio-dental unvoiced [<a title="Audio sample for consonant fricative labio-dental unvoiced " href="../../../phonemes/voiceless_labiodental_fricative.wav?raw=true">Sample</a>] </td>
85
+ </tr>
86
+ <tr>
87
+ <td> 15 </td>
88
+ <td> g </td>
89
+ <td> consonant plosive velar voiced [<a title="Audio sample for consonant plosive velar voiced " href="../../../phonemes/voiced_velar_plosive.wav?raw=true">Sample</a>] </td>
90
+ </tr>
91
+ <tr>
92
+ <td> 16 </td>
93
+ <td> h </td>
94
+ <td> consonant fricative glottal unvoiced [<a title="Audio sample for consonant fricative glottal unvoiced " href="../../../phonemes/voiceless_glottal_fricative.wav?raw=true">Sample</a>] </td>
95
+ </tr>
96
+ <tr>
97
+ <td> 17 </td>
98
+ <td> iː </td>
99
+ <td> vowel close front unrounded [<a title="Audio sample for vowel close front unrounded " href="../../../phonemes/close_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
100
+ </tr>
101
+ <tr>
102
+ <td> 18 </td>
103
+ <td> j </td>
104
+ <td> consonant approximant palatal voiced [<a title="Audio sample for consonant approximant palatal voiced " href="../../../phonemes/palatal_approximant.wav?raw=true">Sample</a>] </td>
105
+ </tr>
106
+ <tr>
107
+ <td> 19 </td>
108
+ <td> k </td>
109
+ <td> consonant plosive velar unvoiced [<a title="Audio sample for consonant plosive velar unvoiced " href="../../../phonemes/voiceless_velar_plosive.wav?raw=true">Sample</a>] </td>
110
+ </tr>
111
+ <tr>
112
+ <td> 20 </td>
113
+ <td> l </td>
114
+ <td> consonant lateral-approximant alveolar voiced [<a title="Audio sample for consonant lateral-approximant alveolar voiced " href="../../../phonemes/alveolar_lateral_approximant.wav?raw=true">Sample</a>] </td>
115
+ </tr>
116
+ <tr>
117
+ <td> 21 </td>
118
+ <td> m </td>
119
+ <td> consonant nasal bilabial voiced [<a title="Audio sample for consonant nasal bilabial voiced " href="../../../phonemes/bilabial_nasal.wav?raw=true">Sample</a>] </td>
120
+ </tr>
121
+ <tr>
122
+ <td> 22 </td>
123
+ <td> n </td>
124
+ <td> consonant nasal alveolar voiced [<a title="Audio sample for consonant nasal alveolar voiced " href="../../../phonemes/alveolar_nasal.wav?raw=true">Sample</a>] </td>
125
+ </tr>
126
+ <tr>
127
+ <td> 23 </td>
128
+ <td> oː </td>
129
+ <td> vowel close-mid back rounded [<a title="Audio sample for vowel close-mid back rounded " href="../../../phonemes/close-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
130
+ </tr>
131
+ <tr>
132
+ <td> 24 </td>
133
+ <td> p </td>
134
+ <td> consonant plosive bilabial unvoiced [<a title="Audio sample for consonant plosive bilabial unvoiced " href="../../../phonemes/voiceless_bilabial_plosive.wav?raw=true">Sample</a>] </td>
135
+ </tr>
136
+ <tr>
137
+ <td> 25 </td>
138
+ <td> p͡f </td>
139
+ <td> consonant affricate labio-dental unvoiced </td>
140
+ </tr>
141
+ <tr>
142
+ <td> 26 </td>
143
+ <td> s </td>
144
+ <td> consonant fricative alveolar unvoiced [<a title="Audio sample for consonant fricative alveolar unvoiced " href="../../../phonemes/voiceless_alveolar_fricative.wav?raw=true">Sample</a>] </td>
145
+ </tr>
146
+ <tr>
147
+ <td> 27 </td>
148
+ <td> t </td>
149
+ <td> consonant plosive alveolar unvoiced [<a title="Audio sample for consonant plosive alveolar unvoiced " href="../../../phonemes/voiceless_alveolar_plosive.wav?raw=true">Sample</a>] </td>
150
+ </tr>
151
+ <tr>
152
+ <td> 28 </td>
153
+ <td> t͡s </td>
154
+ <td> consonant affricate alveolar unvoiced [<a title="Audio sample for consonant affricate alveolar unvoiced " href="../../../phonemes/voiceless_alveolar_affricate.wav?raw=true">Sample</a>] </td>
155
+ </tr>
156
+ <tr>
157
+ <td> 29 </td>
158
+ <td> t͡ʃ </td>
159
+ <td> consonant affricate post-alveolar unvoiced [<a title="Audio sample for consonant affricate post-alveolar unvoiced " href="../../../phonemes/voiceless_postalveolar_affricate.wav?raw=true">Sample</a>] </td>
160
+ </tr>
161
+ <tr>
162
+ <td> 30 </td>
163
+ <td> uː </td>
164
+ <td> vowel close back rounded [<a title="Audio sample for vowel close back rounded " href="../../../phonemes/close_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
165
+ </tr>
166
+ <tr>
167
+ <td> 31 </td>
168
+ <td> v </td>
169
+ <td> consonant fricative labio-dental voiced [<a title="Audio sample for consonant fricative labio-dental voiced " href="../../../phonemes/voiced_labiodental_fricative.wav?raw=true">Sample</a>] </td>
170
+ </tr>
171
+ <tr>
172
+ <td> 32 </td>
173
+ <td> x </td>
174
+ <td> consonant fricative velar unvoiced [<a title="Audio sample for consonant fricative velar unvoiced " href="../../../phonemes/voiceless_velar_fricative.wav?raw=true">Sample</a>] </td>
175
+ </tr>
176
+ <tr>
177
+ <td> 33 </td>
178
+ <td> yː </td>
179
+ <td> vowel close front rounded [<a title="Audio sample for vowel close front rounded " href="../../../phonemes/close_front_rounded_vowel.wav?raw=true">Sample</a>] </td>
180
+ </tr>
181
+ <tr>
182
+ <td> 34 </td>
183
+ <td> z </td>
184
+ <td> consonant fricative alveolar voiced [<a title="Audio sample for consonant fricative alveolar voiced " href="../../../phonemes/voiced_alveolar_fricative.wav?raw=true">Sample</a>] </td>
185
+ </tr>
186
+ <tr>
187
+ <td> 35 </td>
188
+ <td> ãː </td>
189
+ <td> vowel open front unrounded [<a title="Audio sample for vowel open front unrounded " href="../../../phonemes/open_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
190
+ </tr>
191
+ <tr>
192
+ <td> 36 </td>
193
+ <td> ç </td>
194
+ <td> consonant fricative palatal unvoiced [<a title="Audio sample for consonant fricative palatal unvoiced " href="../../../phonemes/voiceless_palatal_fricative.wav?raw=true">Sample</a>] </td>
195
+ </tr>
196
+ <tr>
197
+ <td> 37 </td>
198
+ <td> õː </td>
199
+ <td> vowel close-mid back rounded [<a title="Audio sample for vowel close-mid back rounded " href="../../../phonemes/close-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
200
+ </tr>
201
+ <tr>
202
+ <td> 38 </td>
203
+ <td> øː </td>
204
+ <td> vowel close-mid front rounded [<a title="Audio sample for vowel close-mid front rounded " href="../../../phonemes/close-mid_front_rounded_vowel.wav?raw=true">Sample</a>] </td>
205
+ </tr>
206
+ <tr>
207
+ <td> 39 </td>
208
+ <td> ŋ </td>
209
+ <td> consonant nasal velar voiced [<a title="Audio sample for consonant nasal velar voiced " href="../../../phonemes/velar_nasal.wav?raw=true">Sample</a>] </td>
210
+ </tr>
211
+ <tr>
212
+ <td> 40 </td>
213
+ <td> œ </td>
214
+ <td> vowel open-mid front rounded [<a title="Audio sample for vowel open-mid front rounded " href="../../../phonemes/open-mid_front_rounded_vowel.wav?raw=true">Sample</a>] </td>
215
+ </tr>
216
+ <tr>
217
+ <td> 41 </td>
218
+ <td> ɐ </td>
219
+ <td> vowel near-open central unrounded [<a title="Audio sample for vowel near-open central unrounded " href="../../../phonemes/near-open_central_unrounded_vowel.wav?raw=true">Sample</a>] </td>
220
+ </tr>
221
+ <tr>
222
+ <td> 42 </td>
223
+ <td> ɔ </td>
224
+ <td> vowel open-mid back rounded [<a title="Audio sample for vowel open-mid back rounded " href="../../../phonemes/open-mid_back_rounded_vowel.wav?raw=true">Sample</a>] </td>
225
+ </tr>
226
+ <tr>
227
+ <td> 43 </td>
228
+ <td> ɔʏ̯ </td>
229
+ <td> diphthong </td>
230
+ </tr>
231
+ <tr>
232
+ <td> 44 </td>
233
+ <td> ə </td>
234
+ <td> vowel mid central unrounded </td>
235
+ </tr>
236
+ <tr>
237
+ <td> 45 </td>
238
+ <td> ɛ </td>
239
+ <td> vowel open-mid front unrounded [<a title="Audio sample for vowel open-mid front unrounded " href="../../../phonemes/open-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
240
+ </tr>
241
+ <tr>
242
+ <td> 46 </td>
243
+ <td> ɛː </td>
244
+ <td> vowel open-mid front unrounded [<a title="Audio sample for vowel open-mid front unrounded " href="../../../phonemes/open-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
245
+ </tr>
246
+ <tr>
247
+ <td> 47 </td>
248
+ <td> ɛ̃ː </td>
249
+ <td> vowel open-mid front unrounded [<a title="Audio sample for vowel open-mid front unrounded " href="../../../phonemes/open-mid_front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
250
+ </tr>
251
+ <tr>
252
+ <td> 48 </td>
253
+ <td> ɪ </td>
254
+ <td> vowel near-close near-front unrounded [<a title="Audio sample for vowel near-close near-front unrounded " href="../../../phonemes/near-close_near-front_unrounded_vowel.wav?raw=true">Sample</a>] </td>
255
+ </tr>
256
+ <tr>
257
+ <td> 49 </td>
258
+ <td> ʁ </td>
259
+ <td> consonant fricative uvular voiced [<a title="Audio sample for consonant fricative uvular voiced " href="../../../phonemes/voiced_uvular_fricative.wav?raw=true">Sample</a>] </td>
260
+ </tr>
261
+ <tr>
262
+ <td> 50 </td>
263
+ <td> ʃ </td>
264
+ <td> consonant fricative post-alveolar unvoiced [<a title="Audio sample for consonant fricative post-alveolar unvoiced " href="../../../phonemes/voiceless_postalveolar_fricative.wav?raw=true">Sample</a>] </td>
265
+ </tr>
266
+ <tr>
267
+ <td> 51 </td>
268
+ <td> ʊ </td>
269
+ <td> vowel near-close near-back rounded [<a title="Audio sample for vowel near-close near-back rounded " href="../../../phonemes/near-close_near-back_rounded_vowel.wav?raw=true">Sample</a>] </td>
270
+ </tr>
271
+ <tr>
272
+ <td> 52 </td>
273
+ <td> ʏ </td>
274
+ <td> vowel near-close near-front rounded [<a title="Audio sample for vowel near-close near-front rounded " href="../../../phonemes/near-close_near-front_rounded_vowel.wav?raw=true">Sample</a>] </td>
275
+ </tr>
276
+ <tr>
277
+ <td> 53 </td>
278
+ <td> ʒ </td>
279
+ <td> consonant fricative post-alveolar voiced [<a title="Audio sample for consonant fricative post-alveolar voiced " href="../../../phonemes/voiced_postalveolar_fricative.wav?raw=true">Sample</a>] </td>
280
+ </tr>
281
+ <tr>
282
+ <td> 54 </td>
283
+ <td> ʔ </td>
284
+ <td> consonant plosive glottal unvoiced [<a title="Audio sample for consonant plosive glottal unvoiced " href="../../../phonemes/glottal_plosive.wav?raw=true">Sample</a>] </td>
285
+ </tr>
286
+ <tr>
287
+ <td> 55 </td>
288
+ <td> χ </td>
289
+ <td> consonant fricative uvular unvoiced [<a title="Audio sample for consonant fricative uvular unvoiced " href="../../../phonemes/voiceless_uvular_fricative.wav?raw=true">Sample</a>] </td>
290
+ </tr>
291
+ </table>
voices/de_DE/thorsten-emotion_low/README.md.in ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # German Thorsten Emotion (Low Quality)
2
+
3
+ A single-speaker model for German based on the [Thorsten Emotional dataset](http://www.openslr.org/110/).
4
+
5
+ See LICENSE file for license.
voices/de_DE/thorsten-emotion_low/SOURCE ADDED
@@ -0,0 +1 @@
 
 
1
+ http://www.openslr.org/110/
voices/de_DE/thorsten-emotion_low/VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.1.0
voices/de_DE/thorsten-emotion_low/config.json ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "seed": 1234,
3
+ "epochs": 10000,
4
+ "learning_rate": 0.0002,
5
+ "betas": [
6
+ 0.8,
7
+ 0.99
8
+ ],
9
+ "eps": 1e-09,
10
+ "batch_size": 32,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1.0,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0,
18
+ "grad_clip": null,
19
+ "min_seq_length": null,
20
+ "max_seq_length": 400,
21
+ "min_spec_length": null,
22
+ "max_spec_length": null,
23
+ "min_speaker_utterances": null,
24
+ "last_epoch": 1,
25
+ "global_step": 1,
26
+ "best_loss": null,
27
+ "audio": {
28
+ "filter_length": 1024,
29
+ "hop_length": 256,
30
+ "win_length": 1024,
31
+ "mel_channels": 80,
32
+ "sample_rate": 22050,
33
+ "sample_bytes": 2,
34
+ "channels": 1,
35
+ "mel_fmin": 0.0,
36
+ "mel_fmax": null,
37
+ "ref_level_db": 20.0,
38
+ "spec_gain": 1.0,
39
+ "signal_norm": true,
40
+ "min_level_db": -100.0,
41
+ "max_norm": 1.0,
42
+ "clip_norm": true,
43
+ "symmetric_norm": true,
44
+ "do_dynamic_range_compression": true,
45
+ "convert_db_to_amp": true,
46
+ "do_trim_silence": false,
47
+ "trim_silence_db": 40.0,
48
+ "trim_margin_sec": 0.01,
49
+ "trim_keep_sec": 0.25,
50
+ "scale_mels": false
51
+ },
52
+ "model": {
53
+ "num_symbols": 56,
54
+ "n_speakers": 8,
55
+ "inter_channels": 192,
56
+ "hidden_channels": 192,
57
+ "filter_channels": 768,
58
+ "n_heads": 2,
59
+ "n_layers": 6,
60
+ "kernel_size": 3,
61
+ "p_dropout": 0.1,
62
+ "resblock": "2",
63
+ "resblock_kernel_sizes": [
64
+ 3,
65
+ 5,
66
+ 7
67
+ ],
68
+ "resblock_dilation_sizes": [
69
+ [
70
+ 1,
71
+ 2
72
+ ],
73
+ [
74
+ 2,
75
+ 6
76
+ ],
77
+ [
78
+ 3,
79
+ 12
80
+ ]
81
+ ],
82
+ "upsample_rates": [
83
+ 8,
84
+ 8,
85
+ 4
86
+ ],
87
+ "upsample_initial_channel": 256,
88
+ "upsample_kernel_sizes": [
89
+ 16,
90
+ 16,
91
+ 8
92
+ ],
93
+ "n_layers_q": 3,
94
+ "use_spectral_norm": false,
95
+ "gin_channels": 512,
96
+ "use_sdp": true
97
+ },
98
+ "phonemes": {
99
+ "phoneme_separator": "_",
100
+ "word_separator": "#",
101
+ "phoneme_to_id": null,
102
+ "pad": "_",
103
+ "bos": "^",
104
+ "eos": "$",
105
+ "blank": "_",
106
+ "blank_word": "#",
107
+ "blank_between": "tokens_and_words",
108
+ "blank_at_start": true,
109
+ "blank_at_end": true,
110
+ "simple_punctuation": true,
111
+ "punctuation_map": null,
112
+ "separate": [
113
+ "\u02c8",
114
+ "\u02cc"
115
+ ],
116
+ "separate_graphemes": false,
117
+ "separate_tones": false,
118
+ "tone_before": false,
119
+ "phoneme_map": {
120
+ "\u2016": [
121
+ "|",
122
+ "|"
123
+ ]
124
+ },
125
+ "auto_bos_eos": true,
126
+ "minor_break": "|",
127
+ "major_break": null,
128
+ "break_phonemes_into_graphemes": false,
129
+ "break_phonemes_into_codepoints": false,
130
+ "drop_stress": false,
131
+ "symbols": null
132
+ },
133
+ "text_aligner": {
134
+ "aligner": null,
135
+ "casing": null
136
+ },
137
+ "text_language": "de_DE",
138
+ "phonemizer": "gruut",
139
+ "datasets": [
140
+ {
141
+ "name": "thorsten_emotion",
142
+ "metadata_format": "text",
143
+ "multispeaker": true,
144
+ "text_language": null,
145
+ "audio_dir": "/media/12tb/de-de/thorsten-emotional_v02",
146
+ "cache_dir": "/media/cache/thorsten_emotion"
147
+ }
148
+ ],
149
+ "inference": {
150
+ "length_scale": 1.0,
151
+ "noise_scale": 0.667,
152
+ "noise_w": 0.8,
153
+ "auto_append_text": "."
154
+ },
155
+ "version": 1,
156
+ "git_commit": ""
157
+ }
voices/de_DE/thorsten-emotion_low/generator.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a2588308d23e51874f6c87dd9651fce2375302f4b26bdb98dfe125547d283a5
3
+ size 76346209
voices/de_DE/thorsten-emotion_low/phonemes.txt ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0 _
2
+ 1 ^
3
+ 2 $
4
+ 3 |
5
+ 4 #
6
+ 5 ˈ
7
+ 6 ˌ
8
+ 7 a
9
+ 8 aɪ̯
10
+ 9 aʊ̯
11
+ 10 aː
12
+ 11 b
13
+ 12 d
14
+ 13 eː
15
+ 14 f
16
+ 15 g
17
+ 16 h
18
+ 17 iː
19
+ 18 j
20
+ 19 k
21
+ 20 l
22
+ 21 m
23
+ 22 n
24
+ 23 oː
25
+ 24 p
26
+ 25 p͡f
27
+ 26 s
28
+ 27 t
29
+ 28 t͡s
30
+ 29 t͡ʃ
31
+ 30 uː
32
+ 31 v
33
+ 32 x
34
+ 33 yː
35
+ 34 z
36
+ 35 ãː
37
+ 36 ç
38
+ 37 õː
39
+ 38 øː
40
+ 39 ŋ
41
+ 40 œ
42
+ 41 ɐ
43
+ 42 ɔ
44
+ 43 ɔʏ̯
45
+ 44 ə
46
+ 45 ɛ
47
+ 46 ɛː
48
+ 47 ɛ̃ː
49
+ 48 ɪ
50
+ 49 ʁ
51
+ 50 ʃ
52
+ 51 ʊ
53
+ 52 ʏ
54
+ 53 ʒ
55
+ 54 ʔ
56
+ 55 χ
voices/de_DE/thorsten-emotion_low/speaker_map.csv ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ 0|thorsten_emotion|amused
2
+ 1|thorsten_emotion|angry
3
+ 2|thorsten_emotion|disgusted
4
+ 3|thorsten_emotion|drunk
5
+ 4|thorsten_emotion|neutral
6
+ 5|thorsten_emotion|sleepy
7
+ 6|thorsten_emotion|surprised
8
+ 7|thorsten_emotion|whisper
voices/de_DE/thorsten-emotion_low/speakers.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ amused
2
+ angry
3
+ disgusted
4
+ drunk
5
+ neutral
6
+ sleepy
7
+ surprised
8
+ whisper
voices/de_DE/thorsten_low/ALIASES ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ de
2
+ de_DE
3
+ thorsten
4
+ de_DE/thorsten