fujie committed on
Commit
8d9e7ec
·
1 Parent(s): b9a91df

Update model

Browse files
Files changed (26) hide show
  1. README.md +439 -0
  2. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/config.yaml +361 -0
  3. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_backward_time.png +0 -0
  4. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_fake_loss.png +0 -0
  5. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_forward_time.png +0 -0
  6. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_loss.png +0 -0
  7. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_optim_step_time.png +0 -0
  8. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_real_loss.png +0 -0
  9. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_train_time.png +0 -0
  10. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_adv_loss.png +0 -0
  11. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_backward_time.png +0 -0
  12. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_dur_loss.png +0 -0
  13. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_feat_match_loss.png +0 -0
  14. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_forward_time.png +0 -0
  15. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_kl_loss.png +0 -0
  16. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_loss.png +0 -0
  17. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_mel_loss.png +0 -0
  18. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_optim_step_time.png +0 -0
  19. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_train_time.png +0 -0
  20. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/gpu_max_cached_mem_GB.png +0 -0
  21. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/iter_time.png +0 -0
  22. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/optim0_lr0.png +0 -0
  23. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/optim1_lr0.png +0 -0
  24. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/train_time.png +0 -0
  25. exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/train.total_count.ave_10best.pth +3 -0
  26. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,442 @@
1
  ---
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ja
7
+ datasets:
8
+ - studies
9
  license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `fujie/fujie_studies_tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody`
15
+
16
+ This model was trained by Shinya Fujie using the studies recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 2219358fbd064d79214b12540afd498feaf49596
26
+ pip install -e .
27
+ cd egs2/studies/tts1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model fujie/fujie_studies_tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody
29
+ ```
30
+
31
+
32
+
33
+ ## TTS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: ./conf/tuning/finetune_vits.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: sequence
43
+ output_dir: exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody
44
+ ngpu: 1
45
+ seed: 777
46
+ num_workers: 4
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: 2
51
+ dist_rank: 0
52
+ local_rank: 0
53
+ dist_master_addr: localhost
54
+ dist_master_port: 57369
55
+ dist_launcher: null
56
+ multiprocessing_distributed: true
57
+ unused_parameters: true
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: false
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 100
65
+ patience: null
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - train
75
+ - total_count
76
+ - max
77
+ keep_nbest_models: 10
78
+ nbest_averaging_interval: 0
79
+ grad_clip: -1
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 1
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: 50
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ create_graph_in_tensorboard: false
91
+ use_wandb: false
92
+ wandb_project: null
93
+ wandb_id: null
94
+ wandb_entity: null
95
+ wandb_name: null
96
+ wandb_model_log_interval: -1
97
+ detect_anomaly: false
98
+ pretrain_path: null
99
+ init_param:
100
+ - downloads/models--espnet--kan-bayashi_jsut_vits_prosody/snapshots/3a859bfd2c9710846fa6244598000f0578a2d3e4/exp/tts_train_vits_raw_phn_jaconv_pyopenjtalk_prosody/train.total_count.ave_10best.pth
101
+ ignore_init_mismatch: false
102
+ freeze_param: []
103
+ num_iters_per_epoch: 1000
104
+ batch_size: 20
105
+ valid_batch_size: null
106
+ batch_bins: 1000000
107
+ valid_batch_bins: null
108
+ train_shape_file:
109
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/train/text_shape.phn
110
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/train/speech_shape
111
+ valid_shape_file:
112
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/valid/text_shape.phn
113
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/valid/speech_shape
114
+ batch_type: numel
115
+ valid_batch_type: null
116
+ fold_length:
117
+ - 150
118
+ - 204800
119
+ sort_in_batch: descending
120
+ sort_batch: descending
121
+ multiple_iterator: false
122
+ chunk_length: 500
123
+ chunk_shift_ratio: 0.5
124
+ num_cache_chunks: 1024
125
+ chunk_excluded_key_prefixes: []
126
+ train_data_path_and_name_and_type:
127
+ - - dump/22k/raw/ITA_tr_no_dev/text
128
+ - text
129
+ - text
130
+ - - dump/22k/raw/ITA_tr_no_dev/wav.scp
131
+ - speech
132
+ - sound
133
+ valid_data_path_and_name_and_type:
134
+ - - dump/22k/raw/ITA_dev/text
135
+ - text
136
+ - text
137
+ - - dump/22k/raw/ITA_dev/wav.scp
138
+ - speech
139
+ - sound
140
+ allow_variable_data_keys: false
141
+ max_cache_size: 0.0
142
+ max_cache_fd: 32
143
+ valid_max_cache_size: null
144
+ exclude_weight_decay: false
145
+ exclude_weight_decay_conf: {}
146
+ optim: adamw
147
+ optim_conf:
148
+ lr: 0.0001
149
+ betas:
150
+ - 0.8
151
+ - 0.99
152
+ eps: 1.0e-09
153
+ weight_decay: 0.0
154
+ scheduler: exponentiallr
155
+ scheduler_conf:
156
+ gamma: 0.999875
157
+ optim2: adamw
158
+ optim2_conf:
159
+ lr: 0.0001
160
+ betas:
161
+ - 0.8
162
+ - 0.99
163
+ eps: 1.0e-09
164
+ weight_decay: 0.0
165
+ scheduler2: exponentiallr
166
+ scheduler2_conf:
167
+ gamma: 0.999875
168
+ generator_first: false
169
+ token_list:
170
+ - <blank>
171
+ - <unk>
172
+ - a
173
+ - o
174
+ - i
175
+ - '['
176
+ - '#'
177
+ - u
178
+ - ']'
179
+ - e
180
+ - k
181
+ - n
182
+ - t
183
+ - r
184
+ - s
185
+ - N
186
+ - m
187
+ - _
188
+ - sh
189
+ - d
190
+ - g
191
+ - ^
192
+ - $
193
+ - w
194
+ - cl
195
+ - h
196
+ - y
197
+ - b
198
+ - j
199
+ - ts
200
+ - ch
201
+ - z
202
+ - p
203
+ - f
204
+ - ky
205
+ - ry
206
+ - gy
207
+ - hy
208
+ - ny
209
+ - by
210
+ - my
211
+ - py
212
+ - v
213
+ - dy
214
+ - '?'
215
+ - ty
216
+ - <sos/eos>
217
+ odim: null
218
+ model_conf: {}
219
+ use_preprocessor: true
220
+ token_type: phn
221
+ bpemodel: null
222
+ non_linguistic_symbols: null
223
+ cleaner: jaconv
224
+ g2p: pyopenjtalk_prosody
225
+ feats_extract: linear_spectrogram
226
+ feats_extract_conf:
227
+ n_fft: 1024
228
+ hop_length: 256
229
+ win_length: null
230
+ normalize: null
231
+ normalize_conf: {}
232
+ tts: vits
233
+ tts_conf:
234
+ generator_type: vits_generator
235
+ generator_params:
236
+ hidden_channels: 192
237
+ spks: -1
238
+ global_channels: -1
239
+ segment_size: 32
240
+ text_encoder_attention_heads: 2
241
+ text_encoder_ffn_expand: 4
242
+ text_encoder_blocks: 6
243
+ text_encoder_positionwise_layer_type: conv1d
244
+ text_encoder_positionwise_conv_kernel_size: 3
245
+ text_encoder_positional_encoding_layer_type: rel_pos
246
+ text_encoder_self_attention_layer_type: rel_selfattn
247
+ text_encoder_activation_type: swish
248
+ text_encoder_normalize_before: true
249
+ text_encoder_dropout_rate: 0.1
250
+ text_encoder_positional_dropout_rate: 0.0
251
+ text_encoder_attention_dropout_rate: 0.1
252
+ use_macaron_style_in_text_encoder: true
253
+ use_conformer_conv_in_text_encoder: false
254
+ text_encoder_conformer_kernel_size: -1
255
+ decoder_kernel_size: 7
256
+ decoder_channels: 512
257
+ decoder_upsample_scales:
258
+ - 8
259
+ - 8
260
+ - 2
261
+ - 2
262
+ decoder_upsample_kernel_sizes:
263
+ - 16
264
+ - 16
265
+ - 4
266
+ - 4
267
+ decoder_resblock_kernel_sizes:
268
+ - 3
269
+ - 7
270
+ - 11
271
+ decoder_resblock_dilations:
272
+ - - 1
273
+ - 3
274
+ - 5
275
+ - - 1
276
+ - 3
277
+ - 5
278
+ - - 1
279
+ - 3
280
+ - 5
281
+ use_weight_norm_in_decoder: true
282
+ posterior_encoder_kernel_size: 5
283
+ posterior_encoder_layers: 16
284
+ posterior_encoder_stacks: 1
285
+ posterior_encoder_base_dilation: 1
286
+ posterior_encoder_dropout_rate: 0.0
287
+ use_weight_norm_in_posterior_encoder: true
288
+ flow_flows: 4
289
+ flow_kernel_size: 5
290
+ flow_base_dilation: 1
291
+ flow_layers: 4
292
+ flow_dropout_rate: 0.0
293
+ use_weight_norm_in_flow: true
294
+ use_only_mean_in_flow: true
295
+ stochastic_duration_predictor_kernel_size: 3
296
+ stochastic_duration_predictor_dropout_rate: 0.5
297
+ stochastic_duration_predictor_flows: 4
298
+ stochastic_duration_predictor_dds_conv_layers: 3
299
+ vocabs: 47
300
+ aux_channels: 513
301
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
302
+ discriminator_params:
303
+ scales: 1
304
+ scale_downsample_pooling: AvgPool1d
305
+ scale_downsample_pooling_params:
306
+ kernel_size: 4
307
+ stride: 2
308
+ padding: 2
309
+ scale_discriminator_params:
310
+ in_channels: 1
311
+ out_channels: 1
312
+ kernel_sizes:
313
+ - 15
314
+ - 41
315
+ - 5
316
+ - 3
317
+ channels: 128
318
+ max_downsample_channels: 1024
319
+ max_groups: 16
320
+ bias: true
321
+ downsample_scales:
322
+ - 2
323
+ - 2
324
+ - 4
325
+ - 4
326
+ - 1
327
+ nonlinear_activation: LeakyReLU
328
+ nonlinear_activation_params:
329
+ negative_slope: 0.1
330
+ use_weight_norm: true
331
+ use_spectral_norm: false
332
+ follow_official_norm: false
333
+ periods:
334
+ - 2
335
+ - 3
336
+ - 5
337
+ - 7
338
+ - 11
339
+ period_discriminator_params:
340
+ in_channels: 1
341
+ out_channels: 1
342
+ kernel_sizes:
343
+ - 5
344
+ - 3
345
+ channels: 32
346
+ downsample_scales:
347
+ - 3
348
+ - 3
349
+ - 3
350
+ - 3
351
+ - 1
352
+ max_downsample_channels: 1024
353
+ bias: true
354
+ nonlinear_activation: LeakyReLU
355
+ nonlinear_activation_params:
356
+ negative_slope: 0.1
357
+ use_weight_norm: true
358
+ use_spectral_norm: false
359
+ generator_adv_loss_params:
360
+ average_by_discriminators: false
361
+ loss_type: mse
362
+ discriminator_adv_loss_params:
363
+ average_by_discriminators: false
364
+ loss_type: mse
365
+ feat_match_loss_params:
366
+ average_by_discriminators: false
367
+ average_by_layers: false
368
+ include_final_outputs: true
369
+ mel_loss_params:
370
+ fs: 22050
371
+ n_fft: 1024
372
+ hop_length: 256
373
+ win_length: null
374
+ window: hann
375
+ n_mels: 80
376
+ fmin: 0
377
+ fmax: null
378
+ log_base: null
379
+ lambda_adv: 1.0
380
+ lambda_mel: 45.0
381
+ lambda_feat_match: 2.0
382
+ lambda_dur: 1.0
383
+ lambda_kl: 1.0
384
+ sampling_rate: 22050
385
+ cache_generator_outputs: true
386
+ pitch_extract: null
387
+ pitch_extract_conf: {}
388
+ pitch_normalize: null
389
+ pitch_normalize_conf: {}
390
+ energy_extract: null
391
+ energy_extract_conf: {}
392
+ energy_normalize: null
393
+ energy_normalize_conf: {}
394
+ required:
395
+ - output_dir
396
+ - token_list
397
+ version: '202304'
398
+ distributed: true
399
+ ```
400
+
401
+ </details>
402
+
403
+
404
+
405
+ ### Citing ESPnet
406
+
407
+ ```bibtex
408
+ @inproceedings{watanabe2018espnet,
409
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
410
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
411
+ year={2018},
412
+ booktitle={Proceedings of Interspeech},
413
+ pages={2207--2211},
414
+ doi={10.21437/Interspeech.2018-1456},
415
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
416
+ }
417
+
418
+
419
+
420
+
421
+ @inproceedings{hayashi2020espnet,
422
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
423
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
424
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
425
+ pages={7654--7658},
426
+ year={2020},
427
+ organization={IEEE}
428
+ }
429
+ ```
430
+
431
+ or arXiv:
432
+
433
+ ```bibtex
434
+ @misc{watanabe2018espnet,
435
+ title={ESPnet: End-to-End Speech Processing Toolkit},
436
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
437
+ year={2018},
438
+ eprint={1804.00015},
439
+ archivePrefix={arXiv},
440
+ primaryClass={cs.CL}
441
+ }
442
+ ```
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/config.yaml ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/finetune_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 2
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 57369
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: -1
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: 50
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param:
63
+ - downloads/models--espnet--kan-bayashi_jsut_vits_prosody/snapshots/3a859bfd2c9710846fa6244598000f0578a2d3e4/exp/tts_train_vits_raw_phn_jaconv_pyopenjtalk_prosody/train.total_count.ave_10best.pth
64
+ ignore_init_mismatch: false
65
+ freeze_param: []
66
+ num_iters_per_epoch: 1000
67
+ batch_size: 20
68
+ valid_batch_size: null
69
+ batch_bins: 1000000
70
+ valid_batch_bins: null
71
+ train_shape_file:
72
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/train/text_shape.phn
73
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/train/speech_shape
74
+ valid_shape_file:
75
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/valid/text_shape.phn
76
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/valid/speech_shape
77
+ batch_type: numel
78
+ valid_batch_type: null
79
+ fold_length:
80
+ - 150
81
+ - 204800
82
+ sort_in_batch: descending
83
+ sort_batch: descending
84
+ multiple_iterator: false
85
+ chunk_length: 500
86
+ chunk_shift_ratio: 0.5
87
+ num_cache_chunks: 1024
88
+ chunk_excluded_key_prefixes: []
89
+ train_data_path_and_name_and_type:
90
+ - - dump/22k/raw/ITA_tr_no_dev/text
91
+ - text
92
+ - text
93
+ - - dump/22k/raw/ITA_tr_no_dev/wav.scp
94
+ - speech
95
+ - sound
96
+ valid_data_path_and_name_and_type:
97
+ - - dump/22k/raw/ITA_dev/text
98
+ - text
99
+ - text
100
+ - - dump/22k/raw/ITA_dev/wav.scp
101
+ - speech
102
+ - sound
103
+ allow_variable_data_keys: false
104
+ max_cache_size: 0.0
105
+ max_cache_fd: 32
106
+ valid_max_cache_size: null
107
+ exclude_weight_decay: false
108
+ exclude_weight_decay_conf: {}
109
+ optim: adamw
110
+ optim_conf:
111
+ lr: 0.0001
112
+ betas:
113
+ - 0.8
114
+ - 0.99
115
+ eps: 1.0e-09
116
+ weight_decay: 0.0
117
+ scheduler: exponentiallr
118
+ scheduler_conf:
119
+ gamma: 0.999875
120
+ optim2: adamw
121
+ optim2_conf:
122
+ lr: 0.0001
123
+ betas:
124
+ - 0.8
125
+ - 0.99
126
+ eps: 1.0e-09
127
+ weight_decay: 0.0
128
+ scheduler2: exponentiallr
129
+ scheduler2_conf:
130
+ gamma: 0.999875
131
+ generator_first: false
132
+ token_list:
133
+ - <blank>
134
+ - <unk>
135
+ - a
136
+ - o
137
+ - i
138
+ - '['
139
+ - '#'
140
+ - u
141
+ - ']'
142
+ - e
143
+ - k
144
+ - n
145
+ - t
146
+ - r
147
+ - s
148
+ - N
149
+ - m
150
+ - _
151
+ - sh
152
+ - d
153
+ - g
154
+ - ^
155
+ - $
156
+ - w
157
+ - cl
158
+ - h
159
+ - y
160
+ - b
161
+ - j
162
+ - ts
163
+ - ch
164
+ - z
165
+ - p
166
+ - f
167
+ - ky
168
+ - ry
169
+ - gy
170
+ - hy
171
+ - ny
172
+ - by
173
+ - my
174
+ - py
175
+ - v
176
+ - dy
177
+ - '?'
178
+ - ty
179
+ - <sos/eos>
180
+ odim: null
181
+ model_conf: {}
182
+ use_preprocessor: true
183
+ token_type: phn
184
+ bpemodel: null
185
+ non_linguistic_symbols: null
186
+ cleaner: jaconv
187
+ g2p: pyopenjtalk_prosody
188
+ feats_extract: linear_spectrogram
189
+ feats_extract_conf:
190
+ n_fft: 1024
191
+ hop_length: 256
192
+ win_length: null
193
+ normalize: null
194
+ normalize_conf: {}
195
+ tts: vits
196
+ tts_conf:
197
+ generator_type: vits_generator
198
+ generator_params:
199
+ hidden_channels: 192
200
+ spks: -1
201
+ global_channels: -1
202
+ segment_size: 32
203
+ text_encoder_attention_heads: 2
204
+ text_encoder_ffn_expand: 4
205
+ text_encoder_blocks: 6
206
+ text_encoder_positionwise_layer_type: conv1d
207
+ text_encoder_positionwise_conv_kernel_size: 3
208
+ text_encoder_positional_encoding_layer_type: rel_pos
209
+ text_encoder_self_attention_layer_type: rel_selfattn
210
+ text_encoder_activation_type: swish
211
+ text_encoder_normalize_before: true
212
+ text_encoder_dropout_rate: 0.1
213
+ text_encoder_positional_dropout_rate: 0.0
214
+ text_encoder_attention_dropout_rate: 0.1
215
+ use_macaron_style_in_text_encoder: true
216
+ use_conformer_conv_in_text_encoder: false
217
+ text_encoder_conformer_kernel_size: -1
218
+ decoder_kernel_size: 7
219
+ decoder_channels: 512
220
+ decoder_upsample_scales:
221
+ - 8
222
+ - 8
223
+ - 2
224
+ - 2
225
+ decoder_upsample_kernel_sizes:
226
+ - 16
227
+ - 16
228
+ - 4
229
+ - 4
230
+ decoder_resblock_kernel_sizes:
231
+ - 3
232
+ - 7
233
+ - 11
234
+ decoder_resblock_dilations:
235
+ - - 1
236
+ - 3
237
+ - 5
238
+ - - 1
239
+ - 3
240
+ - 5
241
+ - - 1
242
+ - 3
243
+ - 5
244
+ use_weight_norm_in_decoder: true
245
+ posterior_encoder_kernel_size: 5
246
+ posterior_encoder_layers: 16
247
+ posterior_encoder_stacks: 1
248
+ posterior_encoder_base_dilation: 1
249
+ posterior_encoder_dropout_rate: 0.0
250
+ use_weight_norm_in_posterior_encoder: true
251
+ flow_flows: 4
252
+ flow_kernel_size: 5
253
+ flow_base_dilation: 1
254
+ flow_layers: 4
255
+ flow_dropout_rate: 0.0
256
+ use_weight_norm_in_flow: true
257
+ use_only_mean_in_flow: true
258
+ stochastic_duration_predictor_kernel_size: 3
259
+ stochastic_duration_predictor_dropout_rate: 0.5
260
+ stochastic_duration_predictor_flows: 4
261
+ stochastic_duration_predictor_dds_conv_layers: 3
262
+ vocabs: 47
263
+ aux_channels: 513
264
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
265
+ discriminator_params:
266
+ scales: 1
267
+ scale_downsample_pooling: AvgPool1d
268
+ scale_downsample_pooling_params:
269
+ kernel_size: 4
270
+ stride: 2
271
+ padding: 2
272
+ scale_discriminator_params:
273
+ in_channels: 1
274
+ out_channels: 1
275
+ kernel_sizes:
276
+ - 15
277
+ - 41
278
+ - 5
279
+ - 3
280
+ channels: 128
281
+ max_downsample_channels: 1024
282
+ max_groups: 16
283
+ bias: true
284
+ downsample_scales:
285
+ - 2
286
+ - 2
287
+ - 4
288
+ - 4
289
+ - 1
290
+ nonlinear_activation: LeakyReLU
291
+ nonlinear_activation_params:
292
+ negative_slope: 0.1
293
+ use_weight_norm: true
294
+ use_spectral_norm: false
295
+ follow_official_norm: false
296
+ periods:
297
+ - 2
298
+ - 3
299
+ - 5
300
+ - 7
301
+ - 11
302
+ period_discriminator_params:
303
+ in_channels: 1
304
+ out_channels: 1
305
+ kernel_sizes:
306
+ - 5
307
+ - 3
308
+ channels: 32
309
+ downsample_scales:
310
+ - 3
311
+ - 3
312
+ - 3
313
+ - 3
314
+ - 1
315
+ max_downsample_channels: 1024
316
+ bias: true
317
+ nonlinear_activation: LeakyReLU
318
+ nonlinear_activation_params:
319
+ negative_slope: 0.1
320
+ use_weight_norm: true
321
+ use_spectral_norm: false
322
+ generator_adv_loss_params:
323
+ average_by_discriminators: false
324
+ loss_type: mse
325
+ discriminator_adv_loss_params:
326
+ average_by_discriminators: false
327
+ loss_type: mse
328
+ feat_match_loss_params:
329
+ average_by_discriminators: false
330
+ average_by_layers: false
331
+ include_final_outputs: true
332
+ mel_loss_params:
333
+ fs: 22050
334
+ n_fft: 1024
335
+ hop_length: 256
336
+ win_length: null
337
+ window: hann
338
+ n_mels: 80
339
+ fmin: 0
340
+ fmax: null
341
+ log_base: null
342
+ lambda_adv: 1.0
343
+ lambda_mel: 45.0
344
+ lambda_feat_match: 2.0
345
+ lambda_dur: 1.0
346
+ lambda_kl: 1.0
347
+ sampling_rate: 22050
348
+ cache_generator_outputs: true
349
+ pitch_extract: null
350
+ pitch_extract_conf: {}
351
+ pitch_normalize: null
352
+ pitch_normalize_conf: {}
353
+ energy_extract: null
354
+ energy_extract_conf: {}
355
+ energy_normalize: null
356
+ energy_normalize_conf: {}
357
+ required:
358
+ - output_dir
359
+ - token_list
360
+ version: '202304'
361
+ distributed: true
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_backward_time.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_fake_loss.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_forward_time.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_loss.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_optim_step_time.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_real_loss.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_train_time.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_adv_loss.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_backward_time.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_dur_loss.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_feat_match_loss.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_forward_time.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_kl_loss.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_loss.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_mel_loss.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_optim_step_time.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_train_time.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/iter_time.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/optim0_lr0.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/optim1_lr0.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/train_time.png ADDED
exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/train.total_count.ave_10best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6f3779d002e9faa88d506fe9c7fa41560830e8bdca027476e2cb0d4efe7d7c1
3
+ size 372534219
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202304'
2
+ files:
3
+ model_file: exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/train.total_count.ave_10best.pth
4
+ python: 3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]
5
+ timestamp: 1684334640.290265
6
+ torch: 1.13.1
7
+ yaml_files:
8
+ train_config: exp/tts_finetune_vits_raw_phn_jaconv_pyopenjtalk_prosody/config.yaml