utkarsh2299 committed on
Commit
bc761c1
·
verified ·
1 Parent(s): fb31477

Upload 95 files

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete list.
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. environment.yml +127 -0
  3. phone_dict/assamese +0 -0
  4. phone_dict/bengali +0 -0
  5. phone_dict/bodo +0 -0
  6. phone_dict/english +0 -0
  7. phone_dict/gujarati +0 -0
  8. phone_dict/hindi +0 -0
  9. phone_dict/kannada +0 -0
  10. phone_dict/malayalam +0 -0
  11. phone_dict/manipuri +0 -0
  12. phone_dict/marathi +0 -0
  13. phone_dict/odia +0 -0
  14. phone_dict/punjabi +40 -0
  15. phone_dict/rajasthani +0 -0
  16. phone_dict/tamil +0 -0
  17. phone_dict/telugu +0 -0
  18. phone_dict/urdu +0 -0
  19. punjabi/female/model/config.yaml +281 -0
  20. punjabi/female/model/energy_stats.npz +3 -0
  21. punjabi/female/model/feats_stats.npz +3 -0
  22. punjabi/female/model/feats_type +1 -0
  23. punjabi/female/model/model.pth +3 -0
  24. punjabi/female/model/pitch_stats.npz +3 -0
  25. punjabi/male/model/config.yaml +281 -0
  26. punjabi/male/model/energy_stats.npz +3 -0
  27. punjabi/male/model/feats_stats.npz +3 -0
  28. punjabi/male/model/feats_type +1 -0
  29. punjabi/male/model/model.pth +3 -0
  30. punjabi/male/model/pitch_stats.npz +3 -0
  31. punjabi/output.wav +0 -0
  32. rajasthani/female/model/config.yaml +274 -0
  33. rajasthani/female/model/energy_stats.npz +3 -0
  34. rajasthani/female/model/feats_stats.npz +3 -0
  35. rajasthani/female/model/feats_type +1 -0
  36. rajasthani/female/model/model.pth +3 -0
  37. rajasthani/female/model/pitch_stats.npz +3 -0
  38. rajasthani/male/model/config.yaml +279 -0
  39. rajasthani/male/model/energy_stats.npz +3 -0
  40. rajasthani/male/model/feats_stats.npz +3 -0
  41. rajasthani/male/model/feats_type +1 -0
  42. rajasthani/male/model/model.pth +3 -0
  43. rajasthani/male/model/pitch_stats.npz +3 -0
  44. tamil/female/model/config.yaml +266 -0
  45. tamil/female/model/energy_stats.npz +3 -0
  46. tamil/female/model/feats_stats.npz +3 -0
  47. tamil/female/model/feats_type +1 -0
  48. tamil/female/model/model.pth +3 -0
  49. tamil/female/model/pitch_stats.npz +3 -0
  50. tamil/male/model/config.yaml +272 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ vocoder/female/aryan/hifigan/generator filter=lfs diff=lfs merge=lfs -text
37
+ vocoder/female/dravidian/hifigan/generator filter=lfs diff=lfs merge=lfs -text
38
+ vocoder/male/aryan/hifigan/generator filter=lfs diff=lfs merge=lfs -text
39
+ vocoder/male/dravidian/hifigan/generator filter=lfs diff=lfs merge=lfs -text
environment.yml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: tts-mfa-hifigan
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - ca-certificates=2022.10.11=h06a4308_0
8
+ - certifi=2022.9.24=py37h06a4308_0
9
+ - ld_impl_linux-64=2.38=h1181459_1
10
+ - libffi=3.3=he6710b0_2
11
+ - libgcc-ng=11.2.0=h1234567_1
12
+ - libgomp=11.2.0=h1234567_1
13
+ - libstdcxx-ng=11.2.0=h1234567_1
14
+ - ncurses=6.3=h5eee18b_3
15
+ - openssl=1.1.1s=h7f8727e_0
16
+ - pip=22.2.2=py37h06a4308_0
17
+ - python=3.7.15=haa1d7c7_0
18
+ - readline=8.2=h5eee18b_0
19
+ - setuptools=65.5.0=py37h06a4308_0
20
+ - sqlite=3.39.3=h5082296_0
21
+ - tk=8.6.12=h1ccaba5_0
22
+ - wheel=0.37.1=pyhd3eb1b0_0
23
+ - xz=5.2.6=h5eee18b_0
24
+ - zlib=1.2.13=h5eee18b_0
25
+ - pip:
26
+ - aiosignal==1.3.1
27
+ - appdirs==1.4.4
28
+ - attrs==22.1.0
29
+ - audioread==3.0.0
30
+ - backcall==0.2.0
31
+ - cffi==1.15.1
32
+ - charset-normalizer==2.1.1
33
+ - ci-sdr==0.0.2
34
+ - click==8.0.4
35
+ - configargparse==1.5.3
36
+ - ctc-segmentation==1.7.4
37
+ - cycler==0.11.0
38
+ - cython==0.29.32
39
+ - decorator==5.1.1
40
+ - distance==0.1.3
41
+ - distlib==0.3.6
42
+ - docopt==0.6.2
43
+ - einops==0.6.0
44
+ - espnet==202209
45
+ - espnet-tts-frontend==0.0.3
46
+ - fast-bss-eval==0.1.3
47
+ - filelock==3.8.0
48
+ - flask==2.2.2
49
+ - fonttools==4.38.0
50
+ - frozenlist==1.3.3
51
+ - g2p-en==2.1.0
52
+ - grpcio==1.50.0
53
+ - gunicorn==20.1.0
54
+ - h5py==3.7.0
55
+ - humanfriendly==10.0
56
+ - idna==3.4
57
+ - importlib-metadata==4.13.0
58
+ - importlib-resources==5.10.0
59
+ - indic-num2words==1.0.1
60
+ - indic_unified_parser==1.0.6
61
+ - inflect==6.0.2
62
+ - ipython==7.34.0
63
+ - itsdangerous==2.1.2
64
+ - jaconv==0.3
65
+ - jamo==0.4.1
66
+ - jedi==0.18.2
67
+ - jinja2==3.1.2
68
+ - joblib==1.2.0
69
+ - jsonschema==4.17.0
70
+ - kaldiio==2.17.2
71
+ - kiwisolver==1.4.4
72
+ - librosa==0.9.2
73
+ - llvmlite==0.39.1
74
+ - markupsafe==2.1.1
75
+ - matplotlib==3.5.3
76
+ - matplotlib-inline==0.1.6
77
+ - msgpack==1.0.4
78
+ - nltk==3.7
79
+ - numba==0.56.4
80
+ - numpy==1.21.6
81
+ - packaging==21.3
82
+ - pandas==1.3.5
83
+ - parso==0.8.3
84
+ - pexpect==4.8.0
85
+ - pickleshare==0.7.5
86
+ - pillow==9.3.0
87
+ - pkgutil-resolve-name==1.3.10
88
+ - platformdirs==2.5.4
89
+ - pooch==1.6.0
90
+ - prompt-toolkit==3.0.36
91
+ - protobuf==3.20.1
92
+ - ptyprocess==0.7.0
93
+ - pycparser==2.21
94
+ - pydantic==1.10.2
95
+ - pydub==0.25.1
96
+ - pygments==2.14.0
97
+ - pyparsing==3.0.9
98
+ - pypinyin==0.44.0
99
+ - pyrsistent==0.19.2
100
+ - python-dateutil==2.8.2
101
+ - pytorch-wpe==0.0.1
102
+ - pytz==2022.6
103
+ - pyworld==0.3.2
104
+ - pyyaml==6.0
105
+ - ray==2.1.0
106
+ - regex==2022.10.31
107
+ - requests==2.28.1
108
+ - resampy==0.4.2
109
+ - scikit-learn==1.0.2
110
+ - scipy==1.7.3
111
+ - sentencepiece==0.1.97
112
+ - six==1.16.0
113
+ - soundfile==0.11.0
114
+ - threadpoolctl==3.1.0
115
+ - torch-complex==0.4.3
116
+ - tqdm==4.64.1
117
+ - traitlets==5.8.0
118
+ - typeguard==2.13.3
119
+ - typing-extensions==4.4.0
120
+ - unidecode==1.3.6
121
+ - urllib3==1.26.12
122
+ - virtualenv==20.16.7
123
+ - wcwidth==0.2.5
124
+ - webvtt-py==0.4.6
125
+ - werkzeug==2.2.2
126
+ - zipp==3.10.0
127
+ prefix: /speech/Apps/Flask_app_env/conda_dir/envs/tts-mfa-hifigan
phone_dict/assamese ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/bengali ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/bodo ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/english ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/gujarati ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/hindi ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/kannada ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/malayalam ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/manipuri ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/marathi ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/odia ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/punjabi ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ਦੀ dI
2
+ ਪਿਛਲੇ piClE
3
+ ਮੁਤਾਬਕ mutAbak
4
+ ਜਾਰੀ jArI
5
+ ਮੌਸਮ mousam
6
+ ਬਰਸਾਤ barsAt
7
+ ਅਜੇ ajE
8
+ ਪੈਣ pऐnx
9
+ ਤੇ tE
10
+ ਵਿੱਚ wiwc
11
+ ਤੋਂ toq
12
+ ਵੀ wI
13
+ ਉੱਤਰ uwtar
14
+ ਜਾ jA
15
+ ਥਾਵਾਂ thAwAq
16
+ ਦੋ do
17
+ ਇਨ੍ਹਾਂ inhAq
18
+ ਭਾਰੀ BArI
19
+ ਬਿਹਾਰ bihAr
20
+ ਹਰਿਆਣਾ hariAnxA
21
+ ਦੌਰ dour
22
+ ਹੈ hऐ
23
+ ਦਾ dA
24
+ ਰਾਜਾਂ rAjAq
25
+ ਪ੍ਰਦੇਸ਼ pradEsank
26
+ ਸਕ sak
27
+ ਪਰ par
28
+ ਕੀਤਾ kItA
29
+ ਘੱਟ घawtx
30
+ ਮੱਧ mawध
31
+ ਵਿਭਾਗ wiBAg
32
+ ਬੇਸ਼ੱਕ bEsankawk
33
+ ਮੀਂਹ mIqh
34
+ ਕੁਝ kuJ
35
+ ਸੰਭਾਵਨਾ sarBAwnA
36
+ ਪੰਜਾਬ parjAb
37
+ ਨਹੀਂ nahIq
38
+ ਦਿਨਾਂ dinAq
39
+ ਇਨਕਾਰ inkAr
40
+ ਹੋਈ hoI
phone_dict/rajasthani ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/tamil ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/telugu ADDED
The diff for this file is too large to render. See raw diff
 
phone_dict/urdu ADDED
File without changes
punjabi/female/model/config.yaml ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_fastspeech2_raw_char_None
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 8
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 42343
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ grad_clip: 1.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 8
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_tensorboard: true
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: 800
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 3000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/tts_stats_raw_char_None/train/text_shape.char
72
+ - exp/tts_stats_raw_char_None/train/speech_shape
73
+ valid_shape_file:
74
+ - exp/tts_stats_raw_char_None/valid/text_shape.char
75
+ - exp/tts_stats_raw_char_None/valid/speech_shape
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 204800
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/tr_no_dev/text
89
+ - text
90
+ - text
91
+ - - exp/tts_train_raw_char_None/decode_use_teacher_forcingtrue_train.loss.ave/tr_no_dev/durations
92
+ - durations
93
+ - text_int
94
+ - - dump/raw/tr_no_dev/wav.scp
95
+ - speech
96
+ - sound
97
+ valid_data_path_and_name_and_type:
98
+ - - dump/raw/dev/text
99
+ - text
100
+ - text
101
+ - - exp/tts_train_raw_char_None/decode_use_teacher_forcingtrue_train.loss.ave/dev/durations
102
+ - durations
103
+ - text_int
104
+ - - dump/raw/dev/wav.scp
105
+ - speech
106
+ - sound
107
+ allow_variable_data_keys: false
108
+ max_cache_size: 0.0
109
+ max_cache_fd: 32
110
+ valid_max_cache_size: null
111
+ optim: adam
112
+ optim_conf:
113
+ lr: 1.0
114
+ scheduler: noamlr
115
+ scheduler_conf:
116
+ model_size: 384
117
+ warmup_steps: 4000
118
+ token_list:
119
+ - <blank>
120
+ - <unk>
121
+ - <space>
122
+ - ਾ
123
+ - ੇ
124
+ - ੀ
125
+ - ਰ
126
+ - ਹ
127
+ - ਿ
128
+ - ਦ
129
+ - ਂ
130
+ - ਕ
131
+ - ਸ
132
+ - ਨ
133
+ - ਲ
134
+ - ਤ
135
+ - ਆ
136
+ - ਵ
137
+ - ਮ
138
+ - ੰ
139
+ - ੱ
140
+ - .
141
+ - ੋ
142
+ - ਪ
143
+ - ਗ
144
+ - ਜ
145
+ - ੁ
146
+ - ੂ
147
+ - ਬ
148
+ - ਚ
149
+ - ਉ
150
+ - ਣ
151
+ - ੈ
152
+ - ','
153
+ - ਖ
154
+ - ਇ
155
+ - ਅ
156
+ - ਈ
157
+ - ੜ
158
+ - ਟ
159
+ - ਡ
160
+ - ਸ਼
161
+ - ੍
162
+ - ਫ
163
+ - ਭ
164
+ - ਘ
165
+ - ਏ
166
+ - ਧ
167
+ - ਥ
168
+ - ਛ
169
+ - ਜ਼
170
+ - ਠ
171
+ - ਝ
172
+ - ੌ
173
+ - '?'
174
+ - ਐ
175
+ - ਢ
176
+ - ਼
177
+ - ਓ
178
+ - ਯ
179
+ - '!'
180
+ - ਲ਼
181
+ - ਊ
182
+ - ਗ਼
183
+ - ਫ਼
184
+ - ਔ
185
+ - ਖ਼
186
+ - ਃ
187
+ - ੳ
188
+ - ਞ
189
+ - ੲ
190
+ - ਙ
191
+ - <sos/eos>
192
+ odim: null
193
+ model_conf: {}
194
+ use_preprocessor: true
195
+ token_type: char
196
+ bpemodel: null
197
+ non_linguistic_symbols: null
198
+ cleaner: null
199
+ g2p: g2p_en_no_space
200
+ feats_extract: fbank
201
+ feats_extract_conf:
202
+ n_fft: 1024
203
+ hop_length: 256
204
+ win_length: null
205
+ fs: 22050
206
+ fmin: 0
207
+ fmax: 8000
208
+ n_mels: 80
209
+ normalize: global_mvn
210
+ normalize_conf:
211
+ stats_file: /speech/arun/released_models/tts/female/punjabi/fastspeech2_tf_char/feats_stats.npz
212
+ tts: fastspeech2
213
+ tts_conf:
214
+ adim: 384
215
+ aheads: 2
216
+ elayers: 4
217
+ eunits: 1536
218
+ dlayers: 4
219
+ dunits: 1536
220
+ positionwise_layer_type: conv1d
221
+ positionwise_conv_kernel_size: 3
222
+ duration_predictor_layers: 2
223
+ duration_predictor_chans: 256
224
+ duration_predictor_kernel_size: 3
225
+ postnet_layers: 5
226
+ postnet_filts: 5
227
+ postnet_chans: 256
228
+ use_masking: true
229
+ use_scaled_pos_enc: true
230
+ encoder_normalize_before: true
231
+ decoder_normalize_before: true
232
+ reduction_factor: 1
233
+ init_type: xavier_uniform
234
+ init_enc_alpha: 1.0
235
+ init_dec_alpha: 1.0
236
+ transformer_enc_dropout_rate: 0.2
237
+ transformer_enc_positional_dropout_rate: 0.2
238
+ transformer_enc_attn_dropout_rate: 0.2
239
+ transformer_dec_dropout_rate: 0.2
240
+ transformer_dec_positional_dropout_rate: 0.2
241
+ transformer_dec_attn_dropout_rate: 0.2
242
+ pitch_predictor_layers: 5
243
+ pitch_predictor_chans: 256
244
+ pitch_predictor_kernel_size: 5
245
+ pitch_predictor_dropout: 0.5
246
+ pitch_embed_kernel_size: 1
247
+ pitch_embed_dropout: 0.0
248
+ stop_gradient_from_pitch_predictor: true
249
+ energy_predictor_layers: 2
250
+ energy_predictor_chans: 256
251
+ energy_predictor_kernel_size: 3
252
+ energy_predictor_dropout: 0.5
253
+ energy_embed_kernel_size: 1
254
+ energy_embed_dropout: 0.0
255
+ stop_gradient_from_energy_predictor: false
256
+ pitch_extract: dio
257
+ pitch_extract_conf:
258
+ fs: 22050
259
+ n_fft: 1024
260
+ hop_length: 256
261
+ f0max: 400
262
+ f0min: 80
263
+ reduction_factor: 1
264
+ pitch_normalize: global_mvn
265
+ pitch_normalize_conf:
266
+ stats_file: /speech/arun/released_models/tts/female/punjabi/fastspeech2_tf_char/pitch_stats.npz
267
+ energy_extract: energy
268
+ energy_extract_conf:
269
+ fs: 22050
270
+ n_fft: 1024
271
+ hop_length: 256
272
+ win_length: null
273
+ reduction_factor: 1
274
+ energy_normalize: global_mvn
275
+ energy_normalize_conf:
276
+ stats_file: /speech/arun/released_models/tts/female/punjabi/fastspeech2_tf_char/energy_stats.npz
277
+ required:
278
+ - output_dir
279
+ - token_list
280
+ version: 0.10.3a3
281
+ distributed: true
punjabi/female/model/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd2fe4ec1eb154c2796ef01f6fb615571ea4c1c516555dd0bf5160813e914d34
3
+ size 770
punjabi/female/model/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0fa788f921881f4bf0f38779b1e80afd36397fa3091dc001a5efc89603f10b6
3
+ size 1402
punjabi/female/model/feats_type ADDED
@@ -0,0 +1 @@
 
 
1
+ raw
punjabi/female/model/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:557c40a10ac5f1a6da729b7888fa892c8248f3a25d54eb15afbca53b0a574203
3
+ size 148718075
punjabi/female/model/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cd742bfedeade855aa5b6fe9be805ba22e33111ae1204df4d6c7b8e662dcfee
3
+ size 770
punjabi/male/model/config.yaml ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_grad: 8
2
+ allow_variable_data_keys: false
3
+ batch_bins: 3000000
4
+ batch_size: 20
5
+ batch_type: numel
6
+ best_model_criterion:
7
+ - - valid
8
+ - loss
9
+ - min
10
+ - - train
11
+ - loss
12
+ - min
13
+ bpemodel: null
14
+ chunk_length: 500
15
+ chunk_shift_ratio: 0.5
16
+ cleaner: null
17
+ collect_stats: false
18
+ config: conf/tuning/train_fastspeech2.yaml
19
+ cudnn_benchmark: false
20
+ cudnn_deterministic: true
21
+ cudnn_enabled: true
22
+ detect_anomaly: false
23
+ dist_backend: nccl
24
+ dist_init_method: env://
25
+ dist_launcher: null
26
+ dist_master_addr: localhost
27
+ dist_master_port: 37725
28
+ dist_rank: 0
29
+ dist_world_size: 8
30
+ distributed: true
31
+ dry_run: false
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ energy_extract: energy
37
+ energy_extract_conf:
38
+ fs: 22050
39
+ hop_length: 256
40
+ n_fft: 1024
41
+ reduction_factor: 1
42
+ win_length: null
43
+ energy_normalize: global_mvn
44
+ energy_normalize_conf:
45
+ stats_file: /home/speech/Fastspeech2_HS/punjabi/male/model/energy_stats.npz
46
+ feats_extract: fbank
47
+ feats_extract_conf:
48
+ fmax: 8000
49
+ fmin: 0
50
+ fs: 22050
51
+ hop_length: 256
52
+ n_fft: 1024
53
+ n_mels: 80
54
+ win_length: null
55
+ fold_length:
56
+ - 150
57
+ - 204800
58
+ freeze_param: []
59
+ g2p: g2p_en_no_space
60
+ grad_clip: 1.0
61
+ grad_clip_type: 2.0
62
+ grad_noise: false
63
+ ignore_init_mismatch: false
64
+ init_param: []
65
+ iterator_type: sequence
66
+ keep_nbest_models: 5
67
+ local_rank: 0
68
+ log_interval: null
69
+ log_level: INFO
70
+ max_cache_fd: 32
71
+ max_cache_size: 0.0
72
+ max_epoch: 1000
73
+ model_conf: {}
74
+ multiple_iterator: false
75
+ multiprocessing_distributed: true
76
+ ngpu: 1
77
+ no_forward_run: false
78
+ non_linguistic_symbols: null
79
+ normalize: global_mvn
80
+ normalize_conf:
81
+ stats_file: /home/speech/Fastspeech2_HS/punjabi/male/model/feats_stats.npz
82
+ num_att_plot: 3
83
+ num_cache_chunks: 1024
84
+ num_iters_per_epoch: 800
85
+ num_workers: 1
86
+ odim: null
87
+ optim: adam
88
+ optim_conf:
89
+ lr: 1.0
90
+ output_dir: exp/tts_train_fastspeech2_raw_char_None
91
+ patience: null
92
+ pitch_extract: dio
93
+ pitch_extract_conf:
94
+ f0max: 400
95
+ f0min: 40
96
+ fs: 22050
97
+ hop_length: 256
98
+ n_fft: 1024
99
+ reduction_factor: 1
100
+ pitch_normalize: global_mvn
101
+ pitch_normalize_conf:
102
+ stats_file: /home/speech/Fastspeech2_HS/punjabi/male/model/pitch_stats.npz
103
+ pretrain_path: null
104
+ print_config: false
105
+ required:
106
+ - output_dir
107
+ - token_list
108
+ resume: true
109
+ scheduler: noamlr
110
+ scheduler_conf:
111
+ model_size: 384
112
+ warmup_steps: 4000
113
+ seed: 0
114
+ sharded_ddp: false
115
+ sort_batch: descending
116
+ sort_in_batch: descending
117
+ token_list:
118
+ - <blank>
119
+ - <unk>
120
+ - <space>
121
+ - "\u0A3E"
122
+ - "\u0A47"
123
+ - "\u0A40"
124
+ - "\u0A30"
125
+ - "\u0A39"
126
+ - "\u0A3F"
127
+ - "\u0A26"
128
+ - "\u0A02"
129
+ - "\u0A15"
130
+ - "\u0A38"
131
+ - "\u0A28"
132
+ - "\u0A32"
133
+ - "\u0A24"
134
+ - "\u0A06"
135
+ - "\u0A35"
136
+ - "\u0A2E"
137
+ - "\u0A70"
138
+ - "\u0A71"
139
+ - .
140
+ - "\u0A4B"
141
+ - "\u0A2A"
142
+ - "\u0A17"
143
+ - "\u0A1C"
144
+ - "\u0A41"
145
+ - "\u0A42"
146
+ - "\u0A2C"
147
+ - "\u0A1A"
148
+ - "\u0A09"
149
+ - "\u0A23"
150
+ - "\u0A48"
151
+ - ','
152
+ - "\u0A16"
153
+ - "\u0A07"
154
+ - "\u0A05"
155
+ - "\u0A08"
156
+ - "\u0A5C"
157
+ - "\u0A1F"
158
+ - "\u0A21"
159
+ - "\u0A36"
160
+ - "\u0A4D"
161
+ - "\u0A2B"
162
+ - "\u0A2D"
163
+ - "\u0A18"
164
+ - "\u0A0F"
165
+ - "\u0A27"
166
+ - "\u0A25"
167
+ - "\u0A1B"
168
+ - "\u0A5B"
169
+ - "\u0A20"
170
+ - "\u0A1D"
171
+ - "\u0A4C"
172
+ - '?'
173
+ - "\u0A10"
174
+ - "\u0A22"
175
+ - "\u0A3C"
176
+ - "\u0A13"
177
+ - "\u0A2F"
178
+ - '!'
179
+ - "\u0A33"
180
+ - "\u0A0A"
181
+ - "\u0A5A"
182
+ - "\u0A5E"
183
+ - "\u0A14"
184
+ - "\u0A59"
185
+ - "\u0A03"
186
+ - "\u0A73"
187
+ - "\u0A1E"
188
+ - "\u0A72"
189
+ - "\u0A19"
190
+ - <sos/eos>
191
+ token_type: char
192
+ train_data_path_and_name_and_type:
193
+ - - dump/raw/tr_no_dev/text
194
+ - text
195
+ - text
196
+ - - duration_info_from_teacher/decode_use_teacher_forcingtrue_train.loss.ave/tr_no_dev/durations
197
+ - durations
198
+ - text_int
199
+ - - dump/raw/tr_no_dev/wav.scp
200
+ - speech
201
+ - sound
202
+ train_dtype: float32
203
+ train_shape_file:
204
+ - exp/tts_stats_raw_char_None/train/text_shape.char
205
+ - exp/tts_stats_raw_char_None/train/speech_shape
206
+ tts: fastspeech2
207
+ tts_conf:
208
+ adim: 384
209
+ aheads: 2
210
+ decoder_normalize_before: true
211
+ dlayers: 4
212
+ dunits: 1536
213
+ duration_predictor_chans: 256
214
+ duration_predictor_kernel_size: 3
215
+ duration_predictor_layers: 2
216
+ elayers: 4
217
+ encoder_normalize_before: true
218
+ energy_embed_dropout: 0.0
219
+ energy_embed_kernel_size: 1
220
+ energy_predictor_chans: 256
221
+ energy_predictor_dropout: 0.5
222
+ energy_predictor_kernel_size: 3
223
+ energy_predictor_layers: 2
224
+ eunits: 1536
225
+ init_dec_alpha: 1.0
226
+ init_enc_alpha: 1.0
227
+ init_type: xavier_uniform
228
+ pitch_embed_dropout: 0.0
229
+ pitch_embed_kernel_size: 1
230
+ pitch_predictor_chans: 256
231
+ pitch_predictor_dropout: 0.5
232
+ pitch_predictor_kernel_size: 5
233
+ pitch_predictor_layers: 5
234
+ positionwise_conv_kernel_size: 3
235
+ positionwise_layer_type: conv1d
236
+ postnet_chans: 256
237
+ postnet_filts: 5
238
+ postnet_layers: 5
239
+ reduction_factor: 1
240
+ stop_gradient_from_energy_predictor: false
241
+ stop_gradient_from_pitch_predictor: true
242
+ transformer_dec_attn_dropout_rate: 0.2
243
+ transformer_dec_dropout_rate: 0.2
244
+ transformer_dec_positional_dropout_rate: 0.2
245
+ transformer_enc_attn_dropout_rate: 0.2
246
+ transformer_enc_dropout_rate: 0.2
247
+ transformer_enc_positional_dropout_rate: 0.2
248
+ use_masking: true
249
+ use_scaled_pos_enc: true
250
+ unused_parameters: false
251
+ use_amp: false
252
+ use_preprocessor: true
253
+ use_tensorboard: true
254
+ use_wandb: false
255
+ val_scheduler_criterion:
256
+ - valid
257
+ - loss
258
+ valid_batch_bins: null
259
+ valid_batch_size: null
260
+ valid_batch_type: null
261
+ valid_data_path_and_name_and_type:
262
+ - - dump/raw/dev/text
263
+ - text
264
+ - text
265
+ - - duration_info_from_teacher/decode_use_teacher_forcingtrue_train.loss.ave/dev/durations
266
+ - durations
267
+ - text_int
268
+ - - dump/raw/dev/wav.scp
269
+ - speech
270
+ - sound
271
+ valid_max_cache_size: null
272
+ valid_shape_file:
273
+ - exp/tts_stats_raw_char_None/valid/text_shape.char
274
+ - exp/tts_stats_raw_char_None/valid/speech_shape
275
+ version: 0.10.3a3
276
+ wandb_entity: null
277
+ wandb_id: null
278
+ wandb_model_log_interval: -1
279
+ wandb_name: null
280
+ wandb_project: null
281
+ write_collected_feats: false
punjabi/male/model/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:325fb34d8db2ddda8c4b4d3564227ee9fdc4338272e74c7943eb51567702a2ee
3
+ size 770
punjabi/male/model/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cd21402e685536d58867c03dcb0a83baafc23d26ec3c1f20eaa0211a09351e2
3
+ size 1402
punjabi/male/model/feats_type ADDED
@@ -0,0 +1 @@
 
 
1
+ raw
punjabi/male/model/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04260a1a75a05b97db3e8c23ecb745b96e902fec5760b67d1197a9caccf3e81b
3
+ size 148718067
punjabi/male/model/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a75ab3091c7963b2f8fedeed769c0ddbd30c9106351d678a3a44dc4ad2eb9ffa
3
+ size 770
punjabi/output.wav ADDED
Binary file (753 kB). View file
 
rajasthani/female/model/config.yaml ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_fastspeech2_raw_char_None
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 60003
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ grad_clip: 1.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 8
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_tensorboard: true
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: 800
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 3000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/tts_stats_raw_char_None/train/text_shape.char
72
+ - exp/tts_stats_raw_char_None/train/speech_shape
73
+ valid_shape_file:
74
+ - exp/tts_stats_raw_char_None/valid/text_shape.char
75
+ - exp/tts_stats_raw_char_None/valid/speech_shape
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 204800
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/tr_no_dev/text
89
+ - text
90
+ - text
91
+ - - duration_info/tr_no_dev/durations
92
+ - durations
93
+ - text_int
94
+ - - dump/raw/tr_no_dev/wav.scp
95
+ - speech
96
+ - sound
97
+ - - exp/tts_stats_raw_char_None/train/collect_feats/pitch.scp
98
+ - pitch
99
+ - npy
100
+ - - exp/tts_stats_raw_char_None/train/collect_feats/energy.scp
101
+ - energy
102
+ - npy
103
+ valid_data_path_and_name_and_type:
104
+ - - dump/raw/dev/text
105
+ - text
106
+ - text
107
+ - - duration_info/dev/durations
108
+ - durations
109
+ - text_int
110
+ - - dump/raw/dev/wav.scp
111
+ - speech
112
+ - sound
113
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/pitch.scp
114
+ - pitch
115
+ - npy
116
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/energy.scp
117
+ - energy
118
+ - npy
119
+ allow_variable_data_keys: false
120
+ max_cache_size: 0.0
121
+ max_cache_fd: 32
122
+ valid_max_cache_size: null
123
+ optim: adam
124
+ optim_conf:
125
+ lr: 1.0
126
+ scheduler: noamlr
127
+ scheduler_conf:
128
+ model_size: 384
129
+ warmup_steps: 4000
130
+ token_list:
131
+ - <blank>
132
+ - <unk>
133
+ - A
134
+ - a
135
+ - r
136
+ - ','
137
+ - I
138
+ - q
139
+ - k
140
+ - s
141
+ - E
142
+ - o
143
+ - m
144
+ - n
145
+ - i
146
+ - y
147
+ - t
148
+ - ऐ
149
+ - w
150
+ - j
151
+ - p
152
+ - h
153
+ - l
154
+ - d
155
+ - b
156
+ - ण
157
+ - $
158
+ - .
159
+ - g
160
+ - u
161
+ - U
162
+ - c
163
+ - ख
164
+ - ट
165
+ - D
166
+ - B
167
+ - थ
168
+ - ध
169
+ - ള
170
+ - श
171
+ - औ
172
+ - ठ
173
+ - ड
174
+ - C
175
+ - ष
176
+ - घ
177
+ - P
178
+ - J
179
+ - ढ
180
+ - z
181
+ - f
182
+ - R
183
+ - ऑ
184
+ - <sos/eos>
185
+ odim: null
186
+ model_conf: {}
187
+ use_preprocessor: true
188
+ token_type: char
189
+ bpemodel: null
190
+ non_linguistic_symbols: null
191
+ cleaner: null
192
+ g2p: g2p_en_no_space
193
+ feats_extract: fbank
194
+ feats_extract_conf:
195
+ n_fft: 1024
196
+ hop_length: 256
197
+ win_length: 1024
198
+ fs: 22050
199
+ fmin: 0
200
+ fmax: 8000
201
+ n_mels: 80
202
+ normalize: global_mvn
203
+ normalize_conf:
204
+ stats_file: /speech/arun/released_models/tts/female/rajasthani/fastspeech2_hs/feats_stats.npz
205
+ tts: fastspeech2
206
+ tts_conf:
207
+ adim: 384
208
+ aheads: 2
209
+ elayers: 4
210
+ eunits: 1536
211
+ dlayers: 4
212
+ dunits: 1536
213
+ positionwise_layer_type: conv1d
214
+ positionwise_conv_kernel_size: 3
215
+ duration_predictor_layers: 2
216
+ duration_predictor_chans: 256
217
+ duration_predictor_kernel_size: 3
218
+ postnet_layers: 5
219
+ postnet_filts: 5
220
+ postnet_chans: 256
221
+ use_masking: true
222
+ use_scaled_pos_enc: true
223
+ encoder_normalize_before: true
224
+ decoder_normalize_before: true
225
+ reduction_factor: 1
226
+ init_type: xavier_uniform
227
+ init_enc_alpha: 1.0
228
+ init_dec_alpha: 1.0
229
+ transformer_enc_dropout_rate: 0.2
230
+ transformer_enc_positional_dropout_rate: 0.2
231
+ transformer_enc_attn_dropout_rate: 0.2
232
+ transformer_dec_dropout_rate: 0.2
233
+ transformer_dec_positional_dropout_rate: 0.2
234
+ transformer_dec_attn_dropout_rate: 0.2
235
+ pitch_predictor_layers: 5
236
+ pitch_predictor_chans: 256
237
+ pitch_predictor_kernel_size: 5
238
+ pitch_predictor_dropout: 0.5
239
+ pitch_embed_kernel_size: 1
240
+ pitch_embed_dropout: 0.0
241
+ stop_gradient_from_pitch_predictor: true
242
+ energy_predictor_layers: 2
243
+ energy_predictor_chans: 256
244
+ energy_predictor_kernel_size: 3
245
+ energy_predictor_dropout: 0.5
246
+ energy_embed_kernel_size: 1
247
+ energy_embed_dropout: 0.0
248
+ stop_gradient_from_energy_predictor: false
249
+ pitch_extract: dio
250
+ pitch_extract_conf:
251
+ fs: 22050
252
+ n_fft: 1024
253
+ hop_length: 256
254
+ f0max: 400
255
+ f0min: 80
256
+ reduction_factor: 1
257
+ pitch_normalize: global_mvn
258
+ pitch_normalize_conf:
259
+ stats_file: /speech/arun/released_models/tts/female/rajasthani/fastspeech2_hs/pitch_stats.npz
260
+ energy_extract: energy
261
+ energy_extract_conf:
262
+ fs: 22050
263
+ n_fft: 1024
264
+ hop_length: 256
265
+ win_length: 1024
266
+ reduction_factor: 1
267
+ energy_normalize: global_mvn
268
+ energy_normalize_conf:
269
+ stats_file: /speech/arun/released_models/tts/female/rajasthani/fastspeech2_hs/energy_stats.npz
270
+ required:
271
+ - output_dir
272
+ - token_list
273
+ version: 0.10.3a3
274
+ distributed: true
rajasthani/female/model/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92802cda100723b4dbd0a19a79f6df9f373d19425179c3d5304ad23af3247202
3
+ size 770
rajasthani/female/model/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42ece0e29e4c2bca55edd5c3f23c22634312260c053bc73df657820e9f434790
3
+ size 1402
rajasthani/female/model/feats_type ADDED
@@ -0,0 +1 @@
 
 
1
+ raw
rajasthani/female/model/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddce9f7517bb41a5fac33c7dcdd5a2de6413b8e12e4fb6278f27f12e3ff2cda4
3
+ size 148688878
rajasthani/female/model/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69bcb1b0a0d2f083545534fce050d081548ddc7d7479ca34c6d853d652be1c43
3
+ size 770
rajasthani/male/model/config.yaml ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_fastspeech2_raw_char_None
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 43719
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ grad_clip: 1.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 8
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_tensorboard: true
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: 800
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 3000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/tts_stats_raw_char_None/train/text_shape.char
72
+ - exp/tts_stats_raw_char_None/train/speech_shape
73
+ valid_shape_file:
74
+ - exp/tts_stats_raw_char_None/valid/text_shape.char
75
+ - exp/tts_stats_raw_char_None/valid/speech_shape
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 204800
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/tr_no_dev/text
89
+ - text
90
+ - text
91
+ - - duration_info/tr_no_dev/durations
92
+ - durations
93
+ - text_int
94
+ - - dump/raw/tr_no_dev/wav.scp
95
+ - speech
96
+ - sound
97
+ - - exp/tts_stats_raw_char_None/train/collect_feats/pitch.scp
98
+ - pitch
99
+ - npy
100
+ - - exp/tts_stats_raw_char_None/train/collect_feats/energy.scp
101
+ - energy
102
+ - npy
103
+ valid_data_path_and_name_and_type:
104
+ - - dump/raw/dev/text
105
+ - text
106
+ - text
107
+ - - duration_info/dev/durations
108
+ - durations
109
+ - text_int
110
+ - - dump/raw/dev/wav.scp
111
+ - speech
112
+ - sound
113
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/pitch.scp
114
+ - pitch
115
+ - npy
116
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/energy.scp
117
+ - energy
118
+ - npy
119
+ allow_variable_data_keys: false
120
+ max_cache_size: 0.0
121
+ max_cache_fd: 32
122
+ valid_max_cache_size: null
123
+ optim: adam
124
+ optim_conf:
125
+ lr: 1.0
126
+ scheduler: noamlr
127
+ scheduler_conf:
128
+ model_size: 384
129
+ warmup_steps: 4000
130
+ token_list:
131
+ - <blank>
132
+ - <unk>
133
+ - A
134
+ - a
135
+ - r
136
+ - I
137
+ - ','
138
+ - q
139
+ - o
140
+ - k
141
+ - s
142
+ - ऐ
143
+ - m
144
+ - E
145
+ - i
146
+ - n
147
+ - y
148
+ - t
149
+ - w
150
+ - j
151
+ - h
152
+ - l
153
+ - p
154
+ - d
155
+ - ण
156
+ - $
157
+ - .
158
+ - b
159
+ - g
160
+ - U
161
+ - u
162
+ - c
163
+ - ख
164
+ - D
165
+ - ट
166
+ - B
167
+ - ध
168
+ - थ
169
+ - ള
170
+ - ठ
171
+ - औ
172
+ - घ
173
+ - श
174
+ - ड
175
+ - C
176
+ - P
177
+ - ष
178
+ - J
179
+ - T
180
+ - ढ
181
+ - z
182
+ - R
183
+ - ञ
184
+ - f
185
+ - ऑ
186
+ - M
187
+ - H
188
+ - क
189
+ - <sos/eos>
190
+ odim: null
191
+ model_conf: {}
192
+ use_preprocessor: true
193
+ token_type: char
194
+ bpemodel: null
195
+ non_linguistic_symbols: null
196
+ cleaner: null
197
+ g2p: g2p_en_no_space
198
+ feats_extract: fbank
199
+ feats_extract_conf:
200
+ n_fft: 1024
201
+ hop_length: 256
202
+ win_length: 1024
203
+ fs: 22050
204
+ fmin: 0
205
+ fmax: 8000
206
+ n_mels: 80
207
+ normalize: global_mvn
208
+ normalize_conf:
209
+ stats_file: /speech/arun/released_models/tts/male/rajasthani/fastspeech2_hs/feats_stats.npz
210
+ tts: fastspeech2
211
+ tts_conf:
212
+ adim: 384
213
+ aheads: 2
214
+ elayers: 4
215
+ eunits: 1536
216
+ dlayers: 4
217
+ dunits: 1536
218
+ positionwise_layer_type: conv1d
219
+ positionwise_conv_kernel_size: 3
220
+ duration_predictor_layers: 2
221
+ duration_predictor_chans: 256
222
+ duration_predictor_kernel_size: 3
223
+ postnet_layers: 5
224
+ postnet_filts: 5
225
+ postnet_chans: 256
226
+ use_masking: true
227
+ use_scaled_pos_enc: true
228
+ encoder_normalize_before: true
229
+ decoder_normalize_before: true
230
+ reduction_factor: 1
231
+ init_type: xavier_uniform
232
+ init_enc_alpha: 1.0
233
+ init_dec_alpha: 1.0
234
+ transformer_enc_dropout_rate: 0.2
235
+ transformer_enc_positional_dropout_rate: 0.2
236
+ transformer_enc_attn_dropout_rate: 0.2
237
+ transformer_dec_dropout_rate: 0.2
238
+ transformer_dec_positional_dropout_rate: 0.2
239
+ transformer_dec_attn_dropout_rate: 0.2
240
+ pitch_predictor_layers: 5
241
+ pitch_predictor_chans: 256
242
+ pitch_predictor_kernel_size: 5
243
+ pitch_predictor_dropout: 0.5
244
+ pitch_embed_kernel_size: 1
245
+ pitch_embed_dropout: 0.0
246
+ stop_gradient_from_pitch_predictor: true
247
+ energy_predictor_layers: 2
248
+ energy_predictor_chans: 256
249
+ energy_predictor_kernel_size: 3
250
+ energy_predictor_dropout: 0.5
251
+ energy_embed_kernel_size: 1
252
+ energy_embed_dropout: 0.0
253
+ stop_gradient_from_energy_predictor: false
254
+ pitch_extract: dio
255
+ pitch_extract_conf:
256
+ fs: 22050
257
+ n_fft: 1024
258
+ hop_length: 256
259
+ f0max: 350
260
+ f0min: 40
261
+ reduction_factor: 1
262
+ pitch_normalize: global_mvn
263
+ pitch_normalize_conf:
264
+ stats_file: /speech/arun/released_models/tts/male/rajasthani/fastspeech2_hs/pitch_stats.npz
265
+ energy_extract: energy
266
+ energy_extract_conf:
267
+ fs: 22050
268
+ n_fft: 1024
269
+ hop_length: 256
270
+ win_length: 1024
271
+ reduction_factor: 1
272
+ energy_normalize: global_mvn
273
+ energy_normalize_conf:
274
+ stats_file: /speech/arun/released_models/tts/male/rajasthani/fastspeech2_hs/energy_stats.npz
275
+ required:
276
+ - output_dir
277
+ - token_list
278
+ version: 0.10.3a3
279
+ distributed: true
rajasthani/male/model/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de51706233506aafe9be85372f894ab3880065c9ece3d8006c203638567f12aa
3
+ size 770
rajasthani/male/model/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a23e2b255fed48edce9d6135410580eeeb4b965d6a76e814856cee6bed9ddace
3
+ size 1402
rajasthani/male/model/feats_type ADDED
@@ -0,0 +1 @@
 
 
1
+ raw
rajasthani/male/model/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:396e90e77e55643b1702345e1f2ffcaa95bc20243d94e64c1ffaef1484406678
3
+ size 148696483
rajasthani/male/model/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bb03e1a0f02a3f1a0f41fdffc01fdfad686ba393a5613f18961b1ac861b47b8
3
+ size 770
tamil/female/model/config.yaml ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_fastspeech2_raw_char_None
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 46011
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ grad_clip: 1.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 8
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_tensorboard: true
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: 800
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 3000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/tts_stats_raw_char_None/train/text_shape.char
72
+ - exp/tts_stats_raw_char_None/train/speech_shape
73
+ valid_shape_file:
74
+ - exp/tts_stats_raw_char_None/valid/text_shape.char
75
+ - exp/tts_stats_raw_char_None/valid/speech_shape
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 204800
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/tr_no_dev/text
89
+ - text
90
+ - text
91
+ - - duration_info/tr_no_dev/durations
92
+ - durations
93
+ - text_int
94
+ - - dump/raw/tr_no_dev/wav.scp
95
+ - speech
96
+ - sound
97
+ - - exp/tts_stats_raw_char_None/train/collect_feats/pitch.scp
98
+ - pitch
99
+ - npy
100
+ - - exp/tts_stats_raw_char_None/train/collect_feats/energy.scp
101
+ - energy
102
+ - npy
103
+ valid_data_path_and_name_and_type:
104
+ - - dump/raw/dev/text
105
+ - text
106
+ - text
107
+ - - duration_info/dev/durations
108
+ - durations
109
+ - text_int
110
+ - - dump/raw/dev/wav.scp
111
+ - speech
112
+ - sound
113
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/pitch.scp
114
+ - pitch
115
+ - npy
116
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/energy.scp
117
+ - energy
118
+ - npy
119
+ allow_variable_data_keys: false
120
+ max_cache_size: 0.0
121
+ max_cache_fd: 32
122
+ valid_max_cache_size: null
123
+ optim: adam
124
+ optim_conf:
125
+ lr: 1.0
126
+ scheduler: noamlr
127
+ scheduler_conf:
128
+ model_size: 384
129
+ warmup_steps: 4000
130
+ token_list:
131
+ - <blank>
132
+ - <unk>
133
+ - a
134
+ - i
135
+ - ','
136
+ - k
137
+ - A
138
+ - r
139
+ - u
140
+ - m
141
+ - n
142
+ - t
143
+ - p
144
+ - w
145
+ - d
146
+ - l
147
+ - உ
148
+ - y
149
+ - g
150
+ - ऐ
151
+ - ट
152
+ - न
153
+ - ड
154
+ - र
155
+ - ള
156
+ - s
157
+ - e
158
+ - E
159
+ - ण
160
+ - O
161
+ - o
162
+ - $
163
+ - .
164
+ - ङ
165
+ - b
166
+ - Z
167
+ - U
168
+ - I
169
+ - c
170
+ - j
171
+ - ष
172
+ - ञ
173
+ - h
174
+ - f
175
+ - औ
176
+ - <sos/eos>
177
+ odim: null
178
+ model_conf: {}
179
+ use_preprocessor: true
180
+ token_type: char
181
+ bpemodel: null
182
+ non_linguistic_symbols: null
183
+ cleaner: null
184
+ g2p: g2p_en_no_space
185
+ feats_extract: fbank
186
+ feats_extract_conf:
187
+ n_fft: 1024
188
+ hop_length: 256
189
+ win_length: 1024
190
+ fs: 22050
191
+ fmin: 0
192
+ fmax: 8000
193
+ n_mels: 80
194
+ normalize: global_mvn
195
+ normalize_conf:
196
+ stats_file: /speech/arun/released_models/tts/female/tamil/fastspeech2_hs/feats_stats.npz
197
+ tts: fastspeech2
198
+ tts_conf:
199
+ adim: 384
200
+ aheads: 2
201
+ elayers: 4
202
+ eunits: 1536
203
+ dlayers: 4
204
+ dunits: 1536
205
+ positionwise_layer_type: conv1d
206
+ positionwise_conv_kernel_size: 3
207
+ duration_predictor_layers: 2
208
+ duration_predictor_chans: 256
209
+ duration_predictor_kernel_size: 3
210
+ postnet_layers: 5
211
+ postnet_filts: 5
212
+ postnet_chans: 256
213
+ use_masking: true
214
+ use_scaled_pos_enc: true
215
+ encoder_normalize_before: true
216
+ decoder_normalize_before: true
217
+ reduction_factor: 1
218
+ init_type: xavier_uniform
219
+ init_enc_alpha: 1.0
220
+ init_dec_alpha: 1.0
221
+ transformer_enc_dropout_rate: 0.2
222
+ transformer_enc_positional_dropout_rate: 0.2
223
+ transformer_enc_attn_dropout_rate: 0.2
224
+ transformer_dec_dropout_rate: 0.2
225
+ transformer_dec_positional_dropout_rate: 0.2
226
+ transformer_dec_attn_dropout_rate: 0.2
227
+ pitch_predictor_layers: 5
228
+ pitch_predictor_chans: 256
229
+ pitch_predictor_kernel_size: 5
230
+ pitch_predictor_dropout: 0.5
231
+ pitch_embed_kernel_size: 1
232
+ pitch_embed_dropout: 0.0
233
+ stop_gradient_from_pitch_predictor: true
234
+ energy_predictor_layers: 2
235
+ energy_predictor_chans: 256
236
+ energy_predictor_kernel_size: 3
237
+ energy_predictor_dropout: 0.5
238
+ energy_embed_kernel_size: 1
239
+ energy_embed_dropout: 0.0
240
+ stop_gradient_from_energy_predictor: false
241
+ pitch_extract: dio
242
+ pitch_extract_conf:
243
+ fs: 22050
244
+ n_fft: 1024
245
+ hop_length: 256
246
+ f0max: 400
247
+ f0min: 80
248
+ reduction_factor: 1
249
+ pitch_normalize: global_mvn
250
+ pitch_normalize_conf:
251
+ stats_file: /speech/arun/released_models/tts/female/tamil/fastspeech2_hs/pitch_stats.npz
252
+ energy_extract: energy
253
+ energy_extract_conf:
254
+ fs: 22050
255
+ n_fft: 1024
256
+ hop_length: 256
257
+ win_length: 1024
258
+ reduction_factor: 1
259
+ energy_normalize: global_mvn
260
+ energy_normalize_conf:
261
+ stats_file: /speech/arun/released_models/tts/female/tamil/fastspeech2_hs/energy_stats.npz
262
+ required:
263
+ - output_dir
264
+ - token_list
265
+ version: 0.10.3a3
266
+ distributed: true
tamil/female/model/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd0aed3910609b6d14784a1ed884d2c4256283fc3102f3f0907f51d44905bc15
3
+ size 770
tamil/female/model/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24a87e91f86da65483b0eee8e98abe8264019a133770a65a9fd3228d0ea7f277
3
+ size 1402
tamil/female/model/feats_type ADDED
@@ -0,0 +1 @@
 
 
1
+ raw
tamil/female/model/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:333232faa4ba548952eb9668f47bfb580cbe079643807d42113682ea887eade1
3
+ size 148676597
tamil/female/model/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e01a98009365f40cad2d81a25dcbfd0a4194e99c700907da886cdca94229a214
3
+ size 770
tamil/male/model/config.yaml ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_grad: 8
2
+ allow_variable_data_keys: false
3
+ batch_bins: 3000000
4
+ batch_size: 20
5
+ batch_type: numel
6
+ best_model_criterion:
7
+ - - valid
8
+ - loss
9
+ - min
10
+ - - train
11
+ - loss
12
+ - min
13
+ bpemodel: null
14
+ chunk_length: 500
15
+ chunk_shift_ratio: 0.5
16
+ cleaner: null
17
+ collect_stats: false
18
+ config: conf/tuning/train_fastspeech2.yaml
19
+ cudnn_benchmark: false
20
+ cudnn_deterministic: true
21
+ cudnn_enabled: true
22
+ detect_anomaly: false
23
+ dist_backend: nccl
24
+ dist_init_method: env://
25
+ dist_launcher: null
26
+ dist_master_addr: localhost
27
+ dist_master_port: 46753
28
+ dist_rank: 0
29
+ dist_world_size: 4
30
+ distributed: true
31
+ dry_run: false
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ energy_extract: energy
37
+ energy_extract_conf:
38
+ fs: 22050
39
+ hop_length: 256
40
+ n_fft: 1024
41
+ reduction_factor: 1
42
+ win_length: 1024
43
+ energy_normalize: global_mvn
44
+ energy_normalize_conf:
45
+ stats_file: /home/speech/Fastspeech2_HS/tamil/male/model/energy_stats.npz
46
+ feats_extract: fbank
47
+ feats_extract_conf:
48
+ fmax: 8000
49
+ fmin: 0
50
+ fs: 22050
51
+ hop_length: 256
52
+ n_fft: 1024
53
+ n_mels: 80
54
+ win_length: 1024
55
+ fold_length:
56
+ - 150
57
+ - 204800
58
+ freeze_param: []
59
+ g2p: g2p_en_no_space
60
+ grad_clip: 1.0
61
+ grad_clip_type: 2.0
62
+ grad_noise: false
63
+ ignore_init_mismatch: false
64
+ init_param: []
65
+ iterator_type: sequence
66
+ keep_nbest_models: 5
67
+ local_rank: 0
68
+ log_interval: null
69
+ log_level: INFO
70
+ max_cache_fd: 32
71
+ max_cache_size: 0.0
72
+ max_epoch: 1000
73
+ model_conf: {}
74
+ multiple_iterator: false
75
+ multiprocessing_distributed: true
76
+ ngpu: 1
77
+ no_forward_run: false
78
+ non_linguistic_symbols: null
79
+ normalize: global_mvn
80
+ normalize_conf:
81
+ stats_file: /home/speech/Fastspeech2_HS/tamil/male/model/feats_stats.npz
82
+ num_att_plot: 3
83
+ num_cache_chunks: 1024
84
+ num_iters_per_epoch: 800
85
+ num_workers: 1
86
+ odim: null
87
+ optim: adam
88
+ optim_conf:
89
+ lr: 1.0
90
+ output_dir: exp/tts_train_fastspeech2_raw_char_None
91
+ patience: null
92
+ pitch_extract: dio
93
+ pitch_extract_conf:
94
+ f0max: 350
95
+ f0min: 40
96
+ fs: 22050
97
+ hop_length: 256
98
+ n_fft: 1024
99
+ reduction_factor: 1
100
+ pitch_normalize: global_mvn
101
+ pitch_normalize_conf:
102
+ stats_file: /home/speech/Fastspeech2_HS/tamil/male/model/pitch_stats.npz
103
+ pretrain_path: null
104
+ print_config: false
105
+ required:
106
+ - output_dir
107
+ - token_list
108
+ resume: true
109
+ scheduler: noamlr
110
+ scheduler_conf:
111
+ model_size: 384
112
+ warmup_steps: 4000
113
+ seed: 0
114
+ sharded_ddp: false
115
+ sort_batch: descending
116
+ sort_in_batch: descending
117
+ token_list:
118
+ - <blank>
119
+ - <unk>
120
+ - a
121
+ - i
122
+ - ','
123
+ - r
124
+ - n
125
+ - "\u091F"
126
+ - k
127
+ - m
128
+ - "\u0921"
129
+ - d
130
+ - w
131
+ - l
132
+ - A
133
+ - s
134
+ - E
135
+ - p
136
+ - u
137
+ - t
138
+ - "\u0910"
139
+ - g
140
+ - I
141
+ - y
142
+ - "\u0B89"
143
+ - $
144
+ - .
145
+ - U
146
+ - z
147
+ - h
148
+ - "\u0905"
149
+ - f
150
+ - b
151
+ - o
152
+ - "\u0928"
153
+ - "\u0911"
154
+ - "\u0930"
155
+ - "\u0D33"
156
+ - "\u0919"
157
+ - e
158
+ - "\u0923"
159
+ - O
160
+ - c
161
+ - j
162
+ - "\u0914"
163
+ - "\u0936"
164
+ - Z
165
+ - "\u0937"
166
+ - "\u091E"
167
+ - C
168
+ - "\u090D"
169
+ - <sos/eos>
170
+ token_type: char
171
+ train_data_path_and_name_and_type:
172
+ - - dump/raw/tr_no_dev/text
173
+ - text
174
+ - text
175
+ - - duration_info/tr_no_dev/durations
176
+ - durations
177
+ - text_int
178
+ - - dump/raw/tr_no_dev/wav.scp
179
+ - speech
180
+ - sound
181
+ - - exp/tts_stats_raw_char_None/train/collect_feats/pitch.scp
182
+ - pitch
183
+ - npy
184
+ - - exp/tts_stats_raw_char_None/train/collect_feats/energy.scp
185
+ - energy
186
+ - npy
187
+ train_dtype: float32
188
+ train_shape_file:
189
+ - exp/tts_stats_raw_char_None/train/text_shape.char
190
+ - exp/tts_stats_raw_char_None/train/speech_shape
191
+ tts: fastspeech2
192
+ tts_conf:
193
+ adim: 384
194
+ aheads: 2
195
+ decoder_normalize_before: true
196
+ dlayers: 4
197
+ dunits: 1536
198
+ duration_predictor_chans: 256
199
+ duration_predictor_kernel_size: 3
200
+ duration_predictor_layers: 2
201
+ elayers: 4
202
+ encoder_normalize_before: true
203
+ energy_embed_dropout: 0.0
204
+ energy_embed_kernel_size: 1
205
+ energy_predictor_chans: 256
206
+ energy_predictor_dropout: 0.5
207
+ energy_predictor_kernel_size: 3
208
+ energy_predictor_layers: 2
209
+ eunits: 1536
210
+ init_dec_alpha: 1.0
211
+ init_enc_alpha: 1.0
212
+ init_type: xavier_uniform
213
+ pitch_embed_dropout: 0.0
214
+ pitch_embed_kernel_size: 1
215
+ pitch_predictor_chans: 256
216
+ pitch_predictor_dropout: 0.5
217
+ pitch_predictor_kernel_size: 5
218
+ pitch_predictor_layers: 5
219
+ positionwise_conv_kernel_size: 3
220
+ positionwise_layer_type: conv1d
221
+ postnet_chans: 256
222
+ postnet_filts: 5
223
+ postnet_layers: 5
224
+ reduction_factor: 1
225
+ stop_gradient_from_energy_predictor: false
226
+ stop_gradient_from_pitch_predictor: true
227
+ transformer_dec_attn_dropout_rate: 0.2
228
+ transformer_dec_dropout_rate: 0.2
229
+ transformer_dec_positional_dropout_rate: 0.2
230
+ transformer_enc_attn_dropout_rate: 0.2
231
+ transformer_enc_dropout_rate: 0.2
232
+ transformer_enc_positional_dropout_rate: 0.2
233
+ use_masking: true
234
+ use_scaled_pos_enc: true
235
+ unused_parameters: false
236
+ use_amp: false
237
+ use_preprocessor: true
238
+ use_tensorboard: true
239
+ use_wandb: false
240
+ val_scheduler_criterion:
241
+ - valid
242
+ - loss
243
+ valid_batch_bins: null
244
+ valid_batch_size: null
245
+ valid_batch_type: null
246
+ valid_data_path_and_name_and_type:
247
+ - - dump/raw/dev/text
248
+ - text
249
+ - text
250
+ - - duration_info/dev/durations
251
+ - durations
252
+ - text_int
253
+ - - dump/raw/dev/wav.scp
254
+ - speech
255
+ - sound
256
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/pitch.scp
257
+ - pitch
258
+ - npy
259
+ - - exp/tts_stats_raw_char_None/valid/collect_feats/energy.scp
260
+ - energy
261
+ - npy
262
+ valid_max_cache_size: null
263
+ valid_shape_file:
264
+ - exp/tts_stats_raw_char_None/valid/text_shape.char
265
+ - exp/tts_stats_raw_char_None/valid/speech_shape
266
+ version: 0.10.3a3
267
+ wandb_entity: null
268
+ wandb_id: null
269
+ wandb_model_log_interval: -1
270
+ wandb_name: null
271
+ wandb_project: null
272
+ write_collected_feats: false