yifengyu committed on
Commit
b111a4e
·
1 Parent(s): 9620414

Update model

Browse files
Files changed (31) hide show
  1. README.md +551 -3
  2. dump/24k/raw/org/tr_no_dev/spk2sid +31 -0
  3. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/200epoch.pth +3 -0
  4. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/config.yaml +468 -0
  5. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_backward_time.png +0 -0
  6. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_fake_loss.png +0 -0
  7. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_forward_time.png +0 -0
  8. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_loss.png +0 -0
  9. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_optim_step_time.png +0 -0
  10. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_real_loss.png +0 -0
  11. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_train_time.png +0 -0
  12. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_adv_loss.png +0 -0
  13. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_backward_time.png +0 -0
  14. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_feat_match_loss.png +0 -0
  15. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_forward_time.png +0 -0
  16. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_kl_loss.png +0 -0
  17. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_loss.png +0 -0
  18. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_mel_am_loss.png +0 -0
  19. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_mel_ddsp_loss.png +0 -0
  20. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_mel_loss.png +0 -0
  21. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_optim_step_time.png +0 -0
  22. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_phn_dur_loss.png +0 -0
  23. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_pitch_loss.png +0 -0
  24. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_score_dur_loss.png +0 -0
  25. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_train_time.png +0 -0
  26. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/gpu_max_cached_mem_GB.png +0 -0
  27. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/iter_time.png +0 -0
  28. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/optim0_lr0.png +0 -0
  29. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/optim1_lr0.png +0 -0
  30. exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/train_time.png +0 -0
  31. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,551 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - singing-voice-synthesis
6
+ language: zh
7
+ datasets:
8
+ - acesinger
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 SVS model
13
+
14
+ ### `yifengyu/svs_train_visinger2plus_mert_raw_phn_None_zh_200epoch_acesinger`
15
+
16
+ This model was trained by jerryuhoo using the acesinger recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 4c55d6c9071fb36addcc8426f2befd8f9a1bd11e
26
+ pip install -e .
27
+ cd egs2/acesinger/svs1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model yifengyu/svs_train_visinger2plus_mert_raw_phn_None_zh_200epoch_acesinger
29
+ ```
30
+
31
+
32
+
33
+ ## SVS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: ./conf/tuning/train_visinger_mert.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/24k/svs_train_visinger_mert_raw_phn_None_zh
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 4
49
+ num_att_plot: 0
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: null
53
+ dist_rank: null
54
+ local_rank: 0
55
+ dist_master_addr: null
56
+ dist_master_port: null
57
+ dist_launcher: null
58
+ multiprocessing_distributed: false
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: false
64
+ collect_stats: false
65
+ write_collected_feats: false
66
+ max_epoch: 200
67
+ patience: null
68
+ val_scheduler_criterion:
69
+ - valid
70
+ - loss
71
+ early_stopping_criterion:
72
+ - valid
73
+ - loss
74
+ - min
75
+ best_model_criterion:
76
+ - - train
77
+ - total_count
78
+ - max
79
+ keep_nbest_models: 10
80
+ nbest_averaging_interval: 0
81
+ grad_clip: -1
82
+ grad_clip_type: 2.0
83
+ grad_noise: false
84
+ accum_grad: 1
85
+ no_forward_run: false
86
+ resume: true
87
+ train_dtype: float32
88
+ use_amp: false
89
+ log_interval: 50
90
+ use_matplotlib: true
91
+ use_tensorboard: true
92
+ create_graph_in_tensorboard: false
93
+ use_wandb: false
94
+ wandb_project: null
95
+ wandb_id: null
96
+ wandb_entity: null
97
+ wandb_name: null
98
+ wandb_model_log_interval: -1
99
+ detect_anomaly: false
100
+ use_lora: false
101
+ save_lora_only: true
102
+ lora_conf: {}
103
+ pretrain_path: null
104
+ init_param: []
105
+ ignore_init_mismatch: false
106
+ freeze_param: []
107
+ num_iters_per_epoch: 1000
108
+ batch_size: 4
109
+ valid_batch_size: null
110
+ batch_bins: 1000000
111
+ valid_batch_bins: null
112
+ train_shape_file:
113
+ - exp/24k/svs_stats_raw_phn_None_zh/train/text_shape.phn
114
+ - exp/24k/svs_stats_raw_phn_None_zh/train/singing_shape
115
+ valid_shape_file:
116
+ - exp/24k/svs_stats_raw_phn_None_zh/valid/text_shape.phn
117
+ - exp/24k/svs_stats_raw_phn_None_zh/valid/singing_shape
118
+ batch_type: sorted
119
+ valid_batch_type: null
120
+ fold_length:
121
+ - 150
122
+ - 384000
123
+ sort_in_batch: descending
124
+ shuffle_within_batch: false
125
+ sort_batch: descending
126
+ multiple_iterator: false
127
+ chunk_length: 500
128
+ chunk_shift_ratio: 0.5
129
+ num_cache_chunks: 1024
130
+ chunk_excluded_key_prefixes: []
131
+ chunk_default_fs: null
132
+ train_data_path_and_name_and_type:
133
+ - - dump/24k/raw/tr_no_dev/text
134
+ - text
135
+ - text
136
+ - - dump/24k/raw/tr_no_dev/wav.scp
137
+ - singing
138
+ - sound
139
+ - - dump/24k/raw/tr_no_dev/label
140
+ - label
141
+ - duration
142
+ - - dump/24k/raw/tr_no_dev/score.scp
143
+ - score
144
+ - score
145
+ - - dump/24k/raw/tr_no_dev/utt2sid
146
+ - sids
147
+ - text_int
148
+ valid_data_path_and_name_and_type:
149
+ - - dump/24k/raw/dev/text
150
+ - text
151
+ - text
152
+ - - dump/24k/raw/dev/wav.scp
153
+ - singing
154
+ - sound
155
+ - - dump/24k/raw/dev/label
156
+ - label
157
+ - duration
158
+ - - dump/24k/raw/dev/score.scp
159
+ - score
160
+ - score
161
+ - - dump/24k/raw/dev/utt2sid
162
+ - sids
163
+ - text_int
164
+ allow_variable_data_keys: false
165
+ max_cache_size: 0.0
166
+ max_cache_fd: 32
167
+ allow_multi_rates: false
168
+ valid_max_cache_size: null
169
+ exclude_weight_decay: false
170
+ exclude_weight_decay_conf: {}
171
+ optim: adamw
172
+ optim_conf:
173
+ lr: 0.0002
174
+ betas:
175
+ - 0.8
176
+ - 0.99
177
+ eps: 1.0e-09
178
+ weight_decay: 0.0
179
+ scheduler: exponentiallr
180
+ scheduler_conf:
181
+ gamma: 0.998
182
+ optim2: adamw
183
+ optim2_conf:
184
+ lr: 0.0002
185
+ betas:
186
+ - 0.8
187
+ - 0.99
188
+ eps: 1.0e-09
189
+ weight_decay: 0.0
190
+ scheduler2: exponentiallr
191
+ scheduler2_conf:
192
+ gamma: 0.998
193
+ generator_first: true
194
+ input_size: null
195
+ token_list:
196
+ - <blank>
197
+ - <unk>
198
+ - SP
199
+ - i
200
+ - AP
201
+ - e
202
+ - d
203
+ - y
204
+ - w
205
+ - sh
206
+ - ai
207
+ - n
208
+ - x
209
+ - j
210
+ - u
211
+ - ian
212
+ - l
213
+ - h
214
+ - b
215
+ - o
216
+ - zh
217
+ - ou
218
+ - an
219
+ - m
220
+ - q
221
+ - z
222
+ - en
223
+ - g
224
+ - ing
225
+ - ei
226
+ - ao
227
+ - uo
228
+ - ang
229
+ - eng
230
+ - t
231
+ - ong
232
+ - a
233
+ - ui
234
+ - f
235
+ - k
236
+ - r
237
+ - ch
238
+ - v
239
+ - iang
240
+ - in
241
+ - iao
242
+ - ie
243
+ - iu
244
+ - c
245
+ - s
246
+ - van
247
+ - p
248
+ - ve
249
+ - uan
250
+ - uang
251
+ - ia
252
+ - ua
253
+ - uai
254
+ - un
255
+ - er
256
+ - vn
257
+ - iong
258
+ - <sos/eos>
259
+ odim: null
260
+ model_conf: {}
261
+ use_preprocessor: true
262
+ token_type: phn
263
+ bpemodel: null
264
+ non_linguistic_symbols: null
265
+ cleaner: null
266
+ g2p: null
267
+ fs: 24000
268
+ frontend: s3prl
269
+ frontend_conf:
270
+ frontend_conf:
271
+ upstream: hf_mert_custom
272
+ path_or_url: m-a-p/MERT-v1-330M
273
+ download_dir: ./hub
274
+ multilayer_feature: true
275
+ score_feats_extract: syllable_score_feats
276
+ score_feats_extract_conf:
277
+ fs: 24000
278
+ n_fft: 2048
279
+ win_length: 2048
280
+ hop_length: 480
281
+ feats_extract: fbank
282
+ feats_extract_conf:
283
+ n_fft: 2048
284
+ hop_length: 480
285
+ win_length: 2048
286
+ fs: 24000
287
+ fmin: 80
288
+ fmax: 7600
289
+ n_mels: 80
290
+ normalize: null
291
+ normalize_conf: {}
292
+ svs: vits
293
+ svs_conf:
294
+ generator_type: visinger2
295
+ vocoder_generator_type: visinger2
296
+ generator_params:
297
+ hidden_channels: 192
298
+ spks: 31
299
+ global_channels: 256
300
+ segment_size: 20
301
+ text_encoder_attention_heads: 2
302
+ text_encoder_ffn_expand: 4
303
+ text_encoder_blocks: 6
304
+ text_encoder_positionwise_layer_type: conv1d
305
+ text_encoder_positionwise_conv_kernel_size: 3
306
+ text_encoder_positional_encoding_layer_type: rel_pos
307
+ text_encoder_self_attention_layer_type: rel_selfattn
308
+ text_encoder_activation_type: swish
309
+ text_encoder_normalize_before: true
310
+ text_encoder_dropout_rate: 0.1
311
+ text_encoder_positional_dropout_rate: 0.0
312
+ text_encoder_attention_dropout_rate: 0.1
313
+ use_macaron_style_in_text_encoder: true
314
+ use_conformer_conv_in_text_encoder: false
315
+ text_encoder_conformer_kernel_size: -1
316
+ decoder_kernel_size: 7
317
+ decoder_channels: 512
318
+ decoder_upsample_scales:
319
+ - 12
320
+ - 10
321
+ - 2
322
+ - 2
323
+ decoder_upsample_kernel_sizes:
324
+ - 24
325
+ - 20
326
+ - 4
327
+ - 4
328
+ decoder_resblock_kernel_sizes:
329
+ - 3
330
+ - 7
331
+ - 11
332
+ decoder_resblock_dilations:
333
+ - - 1
334
+ - 3
335
+ - 5
336
+ - - 1
337
+ - 3
338
+ - 5
339
+ - - 1
340
+ - 3
341
+ - 5
342
+ use_weight_norm_in_decoder: true
343
+ posterior_encoder_kernel_size: 3
344
+ posterior_encoder_layers: 8
345
+ posterior_encoder_stacks: 1
346
+ posterior_encoder_base_dilation: 1
347
+ posterior_encoder_dropout_rate: 0.0
348
+ use_weight_norm_in_posterior_encoder: true
349
+ flow_flows: -1
350
+ flow_kernel_size: 5
351
+ flow_base_dilation: 1
352
+ flow_layers: 4
353
+ flow_dropout_rate: 0.0
354
+ use_weight_norm_in_flow: true
355
+ use_only_mean_in_flow: true
356
+ use_phoneme_predictor: false
357
+ vocabs: 63
358
+ aux_channels: 80
359
+ generator_type: visinger2
360
+ vocoder_generator_type: visinger2
361
+ fs: 24000
362
+ hop_length: 480
363
+ win_length: 2048
364
+ n_fft: 2048
365
+ discriminator_type: visinger2
366
+ discriminator_params:
367
+ scales: 1
368
+ scale_downsample_pooling: AvgPool1d
369
+ scale_downsample_pooling_params:
370
+ kernel_size: 4
371
+ stride: 2
372
+ padding: 2
373
+ scale_discriminator_params:
374
+ in_channels: 1
375
+ out_channels: 1
376
+ kernel_sizes:
377
+ - 15
378
+ - 41
379
+ - 5
380
+ - 3
381
+ channels: 128
382
+ max_downsample_channels: 1024
383
+ max_groups: 256
384
+ bias: true
385
+ downsample_scales:
386
+ - 4
387
+ - 4
388
+ - 4
389
+ - 4
390
+ nonlinear_activation: LeakyReLU
391
+ nonlinear_activation_params:
392
+ negative_slope: 0.1
393
+ use_weight_norm: true
394
+ use_spectral_norm: false
395
+ follow_official_norm: false
396
+ periods:
397
+ - 2
398
+ - 3
399
+ - 5
400
+ - 7
401
+ - 11
402
+ period_discriminator_params:
403
+ in_channels: 1
404
+ out_channels: 1
405
+ kernel_sizes:
406
+ - 5
407
+ - 3
408
+ channels: 32
409
+ downsample_scales:
410
+ - 3
411
+ - 3
412
+ - 3
413
+ - 3
414
+ - 1
415
+ max_downsample_channels: 1024
416
+ bias: true
417
+ nonlinear_activation: LeakyReLU
418
+ nonlinear_activation_params:
419
+ negative_slope: 0.1
420
+ use_weight_norm: true
421
+ use_spectral_norm: false
422
+ multi_freq_disc_params:
423
+ hidden_channels:
424
+ - 256
425
+ - 256
426
+ - 256
427
+ - 256
428
+ - 256
429
+ domain: double
430
+ mel_scale: true
431
+ divisors:
432
+ - 32
433
+ - 16
434
+ - 8
435
+ - 4
436
+ - 2
437
+ - 1
438
+ - 1
439
+ strides:
440
+ - 1
441
+ - 2
442
+ - 1
443
+ - 2
444
+ - 1
445
+ - 2
446
+ - 1
447
+ sample_rate: 24000
448
+ hop_lengths:
449
+ - 60
450
+ - 120
451
+ - 180
452
+ - 240
453
+ - 300
454
+ - 360
455
+ generator_adv_loss_params:
456
+ average_by_discriminators: false
457
+ loss_type: mse
458
+ discriminator_adv_loss_params:
459
+ average_by_discriminators: false
460
+ loss_type: mse
461
+ feat_match_loss_params:
462
+ average_by_discriminators: false
463
+ average_by_layers: false
464
+ include_final_outputs: true
465
+ mel_loss_params:
466
+ fs: 24000
467
+ n_fft: 2048
468
+ hop_length: 480
469
+ win_length: 2048
470
+ window: hann
471
+ n_mels: 80
472
+ fmin: 0
473
+ fmax: 12000
474
+ log_base: null
475
+ lambda_adv: 1.0
476
+ lambda_mel: 45.0
477
+ lambda_feat_match: 2.0
478
+ lambda_dur: 0.1
479
+ lambda_pitch: 10.0
480
+ lambda_phoneme: 1.0
481
+ lambda_kl: 1.0
482
+ sampling_rate: 24000
483
+ cache_generator_outputs: true
484
+ pitch_extract: dio
485
+ pitch_extract_conf:
486
+ use_token_averaged_f0: false
487
+ use_log_f0: false
488
+ fs: 24000
489
+ n_fft: 2048
490
+ hop_length: 480
491
+ f0max: 800
492
+ f0min: 80
493
+ pitch_normalize: null
494
+ pitch_normalize_conf: {}
495
+ ying_extract: null
496
+ ying_extract_conf: {}
497
+ energy_extract: null
498
+ energy_extract_conf: {}
499
+ energy_normalize: null
500
+ energy_normalize_conf: {}
501
+ required:
502
+ - output_dir
503
+ - token_list
504
+ version: '202310'
505
+ distributed: false
506
+ ```
507
+
508
+ </details>
509
+
510
+
511
+
512
+ ### Citing ESPnet
513
+
514
+ ```BibTex
515
+ @inproceedings{watanabe2018espnet,
516
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
517
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
518
+ year={2018},
519
+ booktitle={Proceedings of Interspeech},
520
+ pages={2207--2211},
521
+ doi={10.21437/Interspeech.2018-1456},
522
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
523
+ }
524
+
525
+
526
+
527
+
528
+
529
+
530
+ @inproceedings{shi22d_interspeech,
531
+ author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin},
532
+ title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}},
533
+ year=2022,
534
+ booktitle={Proc. Interspeech 2022},
535
+ pages={4277--4281},
536
+ doi={10.21437/Interspeech.2022-10039}
537
+ }
538
+ ```
539
+
540
+ or arXiv:
541
+
542
+ ```bibtex
543
+ @misc{watanabe2018espnet,
544
+ title={ESPnet: End-to-End Speech Processing Toolkit},
545
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
546
+ year={2018},
547
+ eprint={1804.00015},
548
+ archivePrefix={arXiv},
549
+ primaryClass={cs.CL}
550
+ }
551
+ ```
dump/24k/raw/org/tr_no_dev/spk2sid ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <unk> 0
2
+ 1 1
3
+ 10 2
4
+ 11 3
5
+ 12 4
6
+ 13 5
7
+ 14 6
8
+ 15 7
9
+ 16 8
10
+ 17 9
11
+ 18 10
12
+ 19 11
13
+ 2 12
14
+ 20 13
15
+ 21 14
16
+ 22 15
17
+ 23 16
18
+ 24 17
19
+ 25 18
20
+ 26 19
21
+ 27 20
22
+ 28 21
23
+ 29 22
24
+ 3 23
25
+ 30 24
26
+ 4 25
27
+ 5 26
28
+ 6 27
29
+ 7 28
30
+ 8 29
31
+ 9 30
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/200epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6adfcb308a9afdf39a3fa18839aaa8bb3d765ae0970cf12803e0f32094109ba0
3
+ size 1749374811
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/config.yaml ADDED
@@ -0,0 +1,468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/train_visinger_mert.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/24k/svs_train_visinger_mert_raw_phn_None_zh
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 4
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 200
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - train
40
+ - total_count
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: -1
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: 50
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_lora: false
64
+ save_lora_only: true
65
+ lora_conf: {}
66
+ pretrain_path: null
67
+ init_param: []
68
+ ignore_init_mismatch: false
69
+ freeze_param: []
70
+ num_iters_per_epoch: 1000
71
+ batch_size: 4
72
+ valid_batch_size: null
73
+ batch_bins: 1000000
74
+ valid_batch_bins: null
75
+ train_shape_file:
76
+ - exp/24k/svs_stats_raw_phn_None_zh/train/text_shape.phn
77
+ - exp/24k/svs_stats_raw_phn_None_zh/train/singing_shape
78
+ valid_shape_file:
79
+ - exp/24k/svs_stats_raw_phn_None_zh/valid/text_shape.phn
80
+ - exp/24k/svs_stats_raw_phn_None_zh/valid/singing_shape
81
+ batch_type: sorted
82
+ valid_batch_type: null
83
+ fold_length:
84
+ - 150
85
+ - 384000
86
+ sort_in_batch: descending
87
+ shuffle_within_batch: false
88
+ sort_batch: descending
89
+ multiple_iterator: false
90
+ chunk_length: 500
91
+ chunk_shift_ratio: 0.5
92
+ num_cache_chunks: 1024
93
+ chunk_excluded_key_prefixes: []
94
+ chunk_default_fs: null
95
+ train_data_path_and_name_and_type:
96
+ - - dump/24k/raw/tr_no_dev/text
97
+ - text
98
+ - text
99
+ - - dump/24k/raw/tr_no_dev/wav.scp
100
+ - singing
101
+ - sound
102
+ - - dump/24k/raw/tr_no_dev/label
103
+ - label
104
+ - duration
105
+ - - dump/24k/raw/tr_no_dev/score.scp
106
+ - score
107
+ - score
108
+ - - dump/24k/raw/tr_no_dev/utt2sid
109
+ - sids
110
+ - text_int
111
+ valid_data_path_and_name_and_type:
112
+ - - dump/24k/raw/dev/text
113
+ - text
114
+ - text
115
+ - - dump/24k/raw/dev/wav.scp
116
+ - singing
117
+ - sound
118
+ - - dump/24k/raw/dev/label
119
+ - label
120
+ - duration
121
+ - - dump/24k/raw/dev/score.scp
122
+ - score
123
+ - score
124
+ - - dump/24k/raw/dev/utt2sid
125
+ - sids
126
+ - text_int
127
+ allow_variable_data_keys: false
128
+ max_cache_size: 0.0
129
+ max_cache_fd: 32
130
+ allow_multi_rates: false
131
+ valid_max_cache_size: null
132
+ exclude_weight_decay: false
133
+ exclude_weight_decay_conf: {}
134
+ optim: adamw
135
+ optim_conf:
136
+ lr: 0.0002
137
+ betas:
138
+ - 0.8
139
+ - 0.99
140
+ eps: 1.0e-09
141
+ weight_decay: 0.0
142
+ scheduler: exponentiallr
143
+ scheduler_conf:
144
+ gamma: 0.998
145
+ optim2: adamw
146
+ optim2_conf:
147
+ lr: 0.0002
148
+ betas:
149
+ - 0.8
150
+ - 0.99
151
+ eps: 1.0e-09
152
+ weight_decay: 0.0
153
+ scheduler2: exponentiallr
154
+ scheduler2_conf:
155
+ gamma: 0.998
156
+ generator_first: true
157
+ input_size: null
158
+ token_list:
159
+ - <blank>
160
+ - <unk>
161
+ - SP
162
+ - i
163
+ - AP
164
+ - e
165
+ - d
166
+ - y
167
+ - w
168
+ - sh
169
+ - ai
170
+ - n
171
+ - x
172
+ - j
173
+ - u
174
+ - ian
175
+ - l
176
+ - h
177
+ - b
178
+ - o
179
+ - zh
180
+ - ou
181
+ - an
182
+ - m
183
+ - q
184
+ - z
185
+ - en
186
+ - g
187
+ - ing
188
+ - ei
189
+ - ao
190
+ - uo
191
+ - ang
192
+ - eng
193
+ - t
194
+ - ong
195
+ - a
196
+ - ui
197
+ - f
198
+ - k
199
+ - r
200
+ - ch
201
+ - v
202
+ - iang
203
+ - in
204
+ - iao
205
+ - ie
206
+ - iu
207
+ - c
208
+ - s
209
+ - van
210
+ - p
211
+ - ve
212
+ - uan
213
+ - uang
214
+ - ia
215
+ - ua
216
+ - uai
217
+ - un
218
+ - er
219
+ - vn
220
+ - iong
221
+ - <sos/eos>
222
+ odim: null
223
+ model_conf: {}
224
+ use_preprocessor: true
225
+ token_type: phn
226
+ bpemodel: null
227
+ non_linguistic_symbols: null
228
+ cleaner: null
229
+ g2p: null
230
+ fs: 24000
231
+ frontend: s3prl
232
+ frontend_conf:
233
+ frontend_conf:
234
+ upstream: hf_mert_custom
235
+ path_or_url: m-a-p/MERT-v1-330M
236
+ download_dir: ./hub
237
+ multilayer_feature: true
238
+ score_feats_extract: syllable_score_feats
239
+ score_feats_extract_conf:
240
+ fs: 24000
241
+ n_fft: 2048
242
+ win_length: 2048
243
+ hop_length: 480
244
+ feats_extract: fbank
245
+ feats_extract_conf:
246
+ n_fft: 2048
247
+ hop_length: 480
248
+ win_length: 2048
249
+ fs: 24000
250
+ fmin: 80
251
+ fmax: 7600
252
+ n_mels: 80
253
+ normalize: null
254
+ normalize_conf: {}
255
+ svs: vits
256
+ svs_conf:
257
+ generator_type: visinger2
258
+ vocoder_generator_type: visinger2
259
+ generator_params:
260
+ hidden_channels: 192
261
+ spks: 31
262
+ global_channels: 256
263
+ segment_size: 20
264
+ text_encoder_attention_heads: 2
265
+ text_encoder_ffn_expand: 4
266
+ text_encoder_blocks: 6
267
+ text_encoder_positionwise_layer_type: conv1d
268
+ text_encoder_positionwise_conv_kernel_size: 3
269
+ text_encoder_positional_encoding_layer_type: rel_pos
270
+ text_encoder_self_attention_layer_type: rel_selfattn
271
+ text_encoder_activation_type: swish
272
+ text_encoder_normalize_before: true
273
+ text_encoder_dropout_rate: 0.1
274
+ text_encoder_positional_dropout_rate: 0.0
275
+ text_encoder_attention_dropout_rate: 0.1
276
+ use_macaron_style_in_text_encoder: true
277
+ use_conformer_conv_in_text_encoder: false
278
+ text_encoder_conformer_kernel_size: -1
279
+ decoder_kernel_size: 7
280
+ decoder_channels: 512
281
+ decoder_upsample_scales:
282
+ - 12
283
+ - 10
284
+ - 2
285
+ - 2
286
+ decoder_upsample_kernel_sizes:
287
+ - 24
288
+ - 20
289
+ - 4
290
+ - 4
291
+ decoder_resblock_kernel_sizes:
292
+ - 3
293
+ - 7
294
+ - 11
295
+ decoder_resblock_dilations:
296
+ - - 1
297
+ - 3
298
+ - 5
299
+ - - 1
300
+ - 3
301
+ - 5
302
+ - - 1
303
+ - 3
304
+ - 5
305
+ use_weight_norm_in_decoder: true
306
+ posterior_encoder_kernel_size: 3
307
+ posterior_encoder_layers: 8
308
+ posterior_encoder_stacks: 1
309
+ posterior_encoder_base_dilation: 1
310
+ posterior_encoder_dropout_rate: 0.0
311
+ use_weight_norm_in_posterior_encoder: true
312
+ flow_flows: -1
313
+ flow_kernel_size: 5
314
+ flow_base_dilation: 1
315
+ flow_layers: 4
316
+ flow_dropout_rate: 0.0
317
+ use_weight_norm_in_flow: true
318
+ use_only_mean_in_flow: true
319
+ use_phoneme_predictor: false
320
+ vocabs: 63
321
+ aux_channels: 80
322
+ generator_type: visinger2
323
+ vocoder_generator_type: visinger2
324
+ fs: 24000
325
+ hop_length: 480
326
+ win_length: 2048
327
+ n_fft: 2048
328
+ discriminator_type: visinger2
329
+ discriminator_params:
330
+ scales: 1
331
+ scale_downsample_pooling: AvgPool1d
332
+ scale_downsample_pooling_params:
333
+ kernel_size: 4
334
+ stride: 2
335
+ padding: 2
336
+ scale_discriminator_params:
337
+ in_channels: 1
338
+ out_channels: 1
339
+ kernel_sizes:
340
+ - 15
341
+ - 41
342
+ - 5
343
+ - 3
344
+ channels: 128
345
+ max_downsample_channels: 1024
346
+ max_groups: 256
347
+ bias: true
348
+ downsample_scales:
349
+ - 4
350
+ - 4
351
+ - 4
352
+ - 4
353
+ nonlinear_activation: LeakyReLU
354
+ nonlinear_activation_params:
355
+ negative_slope: 0.1
356
+ use_weight_norm: true
357
+ use_spectral_norm: false
358
+ follow_official_norm: false
359
+ periods:
360
+ - 2
361
+ - 3
362
+ - 5
363
+ - 7
364
+ - 11
365
+ period_discriminator_params:
366
+ in_channels: 1
367
+ out_channels: 1
368
+ kernel_sizes:
369
+ - 5
370
+ - 3
371
+ channels: 32
372
+ downsample_scales:
373
+ - 3
374
+ - 3
375
+ - 3
376
+ - 3
377
+ - 1
378
+ max_downsample_channels: 1024
379
+ bias: true
380
+ nonlinear_activation: LeakyReLU
381
+ nonlinear_activation_params:
382
+ negative_slope: 0.1
383
+ use_weight_norm: true
384
+ use_spectral_norm: false
385
+ multi_freq_disc_params:
386
+ hidden_channels:
387
+ - 256
388
+ - 256
389
+ - 256
390
+ - 256
391
+ - 256
392
+ domain: double
393
+ mel_scale: true
394
+ divisors:
395
+ - 32
396
+ - 16
397
+ - 8
398
+ - 4
399
+ - 2
400
+ - 1
401
+ - 1
402
+ strides:
403
+ - 1
404
+ - 2
405
+ - 1
406
+ - 2
407
+ - 1
408
+ - 2
409
+ - 1
410
+ sample_rate: 24000
411
+ hop_lengths:
412
+ - 60
413
+ - 120
414
+ - 180
415
+ - 240
416
+ - 300
417
+ - 360
418
+ generator_adv_loss_params:
419
+ average_by_discriminators: false
420
+ loss_type: mse
421
+ discriminator_adv_loss_params:
422
+ average_by_discriminators: false
423
+ loss_type: mse
424
+ feat_match_loss_params:
425
+ average_by_discriminators: false
426
+ average_by_layers: false
427
+ include_final_outputs: true
428
+ mel_loss_params:
429
+ fs: 24000
430
+ n_fft: 2048
431
+ hop_length: 480
432
+ win_length: 2048
433
+ window: hann
434
+ n_mels: 80
435
+ fmin: 0
436
+ fmax: 12000
437
+ log_base: null
438
+ lambda_adv: 1.0
439
+ lambda_mel: 45.0
440
+ lambda_feat_match: 2.0
441
+ lambda_dur: 0.1
442
+ lambda_pitch: 10.0
443
+ lambda_phoneme: 1.0
444
+ lambda_kl: 1.0
445
+ sampling_rate: 24000
446
+ cache_generator_outputs: true
447
+ pitch_extract: dio
448
+ pitch_extract_conf:
449
+ use_token_averaged_f0: false
450
+ use_log_f0: false
451
+ fs: 24000
452
+ n_fft: 2048
453
+ hop_length: 480
454
+ f0max: 800
455
+ f0min: 80
456
+ pitch_normalize: null
457
+ pitch_normalize_conf: {}
458
+ ying_extract: null
459
+ ying_extract_conf: {}
460
+ energy_extract: null
461
+ energy_extract_conf: {}
462
+ energy_normalize: null
463
+ energy_normalize_conf: {}
464
+ required:
465
+ - output_dir
466
+ - token_list
467
+ version: '202310'
468
+ distributed: false
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_backward_time.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_fake_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_forward_time.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_optim_step_time.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_real_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/discriminator_train_time.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_adv_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_backward_time.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_feat_match_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_forward_time.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_kl_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_mel_am_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_mel_ddsp_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_mel_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_optim_step_time.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_phn_dur_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_pitch_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_score_dur_loss.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/generator_train_time.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/gpu_max_cached_mem_GB.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/iter_time.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/optim0_lr0.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/optim1_lr0.png ADDED
exp/24k/svs_train_visinger_mert_raw_phn_None_zh/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202409'
2
+ files:
3
+ model_file: exp/24k/svs_train_visinger_mert_raw_phn_None_zh/200epoch.pth
4
+ python: "3.9.18 (main, Sep 11 2023, 13:41:44) \n[GCC 11.2.0]"
5
+ timestamp: 1734275525.345454
6
+ torch: 2.0.0.dev20230206+cu118
7
+ yaml_files:
8
+ train_config: exp/24k/svs_train_visinger_mert_raw_phn_None_zh/config.yaml