C44 committed on
Commit
94f44b5
·
verified ·
1 Parent(s): 6586015

Upload 24 files

Browse files
Files changed (24) hide show
  1. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/100epoch.pth +3 -0
  2. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/config.yaml +408 -0
  3. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_backward_time.png +0 -0
  4. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_fake_loss.png +0 -0
  5. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_forward_time.png +0 -0
  6. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_loss.png +0 -0
  7. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_optim_step_time.png +0 -0
  8. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_real_loss.png +0 -0
  9. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_train_time.png +0 -0
  10. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_adv_loss.png +0 -0
  11. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_backward_time.png +0 -0
  12. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_dur_loss.png +0 -0
  13. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_feat_match_loss.png +0 -0
  14. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_forward_time.png +0 -0
  15. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_kl_loss.png +0 -0
  16. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_loss.png +0 -0
  17. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_mel_loss.png +0 -0
  18. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_optim_step_time.png +0 -0
  19. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_train_time.png +0 -0
  20. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/gpu_max_cached_mem_GB.png +0 -0
  21. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/iter_time.png +0 -0
  22. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/optim0_lr0.png +0 -0
  23. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/optim1_lr0.png +0 -0
  24. exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/train_time.png +0 -0
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/100epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6207f8a0abcac79579f276321b72c04b9355ea84c00d5edfe03e5a16e9503cc
3
+ size 373277454
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/config.yaml ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/finetune_full_band_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk_accent_with_pause
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 4
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 100
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - train
40
+ - total_count
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: -1
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: 50
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_lora: false
64
+ save_lora_only: true
65
+ lora_conf: {}
66
+ pretrain_path: null
67
+ init_param:
68
+ - downloads/c65dd99aa55a3c4fd6fcb15d3804e5cd/exp/tts_train_full_band_vits_raw_phn_jaconv_pyopenjtalk_accent_with_pause/train.total_count.ave_10best.pth:tts:tts
69
+ ignore_init_mismatch: false
70
+ freeze_param: []
71
+ num_iters_per_epoch: 1000
72
+ batch_size: 20
73
+ valid_batch_size: null
74
+ batch_bins: 100000
75
+ valid_batch_bins: null
76
+ train_shape_file:
77
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/text_shape.phn
78
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/speech_shape
79
+ valid_shape_file:
80
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/text_shape.phn
81
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/speech_shape
82
+ batch_type: numel
83
+ valid_batch_type: null
84
+ fold_length:
85
+ - 150
86
+ - 409600
87
+ sort_in_batch: descending
88
+ shuffle_within_batch: false
89
+ sort_batch: descending
90
+ multiple_iterator: false
91
+ chunk_length: 500
92
+ chunk_shift_ratio: 0.5
93
+ num_cache_chunks: 1024
94
+ chunk_excluded_key_prefixes: []
95
+ train_data_path_and_name_and_type:
96
+ - - dump/44k/raw/tr_no_dev/text
97
+ - text
98
+ - text
99
+ - - dump/44k/raw/tr_no_dev/wav.scp
100
+ - speech
101
+ - sound
102
+ valid_data_path_and_name_and_type:
103
+ - - dump/44k/raw/dev/text
104
+ - text
105
+ - text
106
+ - - dump/44k/raw/dev/wav.scp
107
+ - speech
108
+ - sound
109
+ allow_variable_data_keys: false
110
+ max_cache_size: 0.0
111
+ max_cache_fd: 32
112
+ allow_multi_rates: false
113
+ valid_max_cache_size: null
114
+ exclude_weight_decay: false
115
+ exclude_weight_decay_conf: {}
116
+ optim: adamw
117
+ optim_conf:
118
+ lr: 0.0001
119
+ betas:
120
+ - 0.8
121
+ - 0.99
122
+ eps: 1.0e-09
123
+ weight_decay: 0.0
124
+ scheduler: exponentiallr
125
+ scheduler_conf:
126
+ gamma: 0.999875
127
+ optim2: adamw
128
+ optim2_conf:
129
+ lr: 0.0001
130
+ betas:
131
+ - 0.8
132
+ - 0.99
133
+ eps: 1.0e-09
134
+ weight_decay: 0.0
135
+ scheduler2: exponentiallr
136
+ scheduler2_conf:
137
+ gamma: 0.999875
138
+ generator_first: false
139
+ token_list:
140
+ - <blank>
141
+ - <unk>
142
+ - '1'
143
+ - '2'
144
+ - '0'
145
+ - '3'
146
+ - '4'
147
+ - '-1'
148
+ - '5'
149
+ - a
150
+ - o
151
+ - '-2'
152
+ - i
153
+ - '-3'
154
+ - u
155
+ - e
156
+ - k
157
+ - n
158
+ - t
159
+ - '6'
160
+ - r
161
+ - '-4'
162
+ - s
163
+ - N
164
+ - m
165
+ - pau
166
+ - '7'
167
+ - sh
168
+ - d
169
+ - g
170
+ - w
171
+ - '8'
172
+ - U
173
+ - '-5'
174
+ - I
175
+ - cl
176
+ - h
177
+ - y
178
+ - b
179
+ - '9'
180
+ - j
181
+ - ts
182
+ - ch
183
+ - '-6'
184
+ - z
185
+ - p
186
+ - '-7'
187
+ - f
188
+ - ky
189
+ - ry
190
+ - '-8'
191
+ - gy
192
+ - '-9'
193
+ - hy
194
+ - ny
195
+ - '-10'
196
+ - by
197
+ - my
198
+ - '-11'
199
+ - '-12'
200
+ - '-13'
201
+ - py
202
+ - '-14'
203
+ - '-15'
204
+ - v
205
+ - '10'
206
+ - '-16'
207
+ - '-17'
208
+ - '11'
209
+ - '-21'
210
+ - '-20'
211
+ - '12'
212
+ - '-19'
213
+ - '13'
214
+ - '-18'
215
+ - '14'
216
+ - dy
217
+ - '15'
218
+ - ty
219
+ - '-22'
220
+ - '16'
221
+ - '18'
222
+ - '19'
223
+ - '17'
224
+ - <sos/eos>
225
+ odim: null
226
+ model_conf: {}
227
+ use_preprocessor: true
228
+ token_type: phn
229
+ bpemodel: null
230
+ non_linguistic_symbols: null
231
+ cleaner: jaconv
232
+ g2p: pyopenjtalk_accent_with_pause
233
+ feats_extract: linear_spectrogram
234
+ feats_extract_conf:
235
+ n_fft: 2048
236
+ hop_length: 512
237
+ win_length: null
238
+ normalize: null
239
+ normalize_conf: {}
240
+ tts: vits
241
+ tts_conf:
242
+ generator_type: vits_generator
243
+ generator_params:
244
+ hidden_channels: 192
245
+ spks: -1
246
+ global_channels: -1
247
+ segment_size: 32
248
+ text_encoder_attention_heads: 2
249
+ text_encoder_ffn_expand: 4
250
+ text_encoder_blocks: 6
251
+ text_encoder_positionwise_layer_type: conv1d
252
+ text_encoder_positionwise_conv_kernel_size: 3
253
+ text_encoder_positional_encoding_layer_type: rel_pos
254
+ text_encoder_self_attention_layer_type: rel_selfattn
255
+ text_encoder_activation_type: swish
256
+ text_encoder_normalize_before: true
257
+ text_encoder_dropout_rate: 0.1
258
+ text_encoder_positional_dropout_rate: 0.0
259
+ text_encoder_attention_dropout_rate: 0.1
260
+ use_macaron_style_in_text_encoder: true
261
+ use_conformer_conv_in_text_encoder: false
262
+ text_encoder_conformer_kernel_size: -1
263
+ decoder_kernel_size: 7
264
+ decoder_channels: 512
265
+ decoder_upsample_scales:
266
+ - 8
267
+ - 8
268
+ - 2
269
+ - 2
270
+ - 2
271
+ decoder_upsample_kernel_sizes:
272
+ - 16
273
+ - 16
274
+ - 4
275
+ - 4
276
+ - 4
277
+ decoder_resblock_kernel_sizes:
278
+ - 3
279
+ - 7
280
+ - 11
281
+ decoder_resblock_dilations:
282
+ - - 1
283
+ - 3
284
+ - 5
285
+ - - 1
286
+ - 3
287
+ - 5
288
+ - - 1
289
+ - 3
290
+ - 5
291
+ use_weight_norm_in_decoder: true
292
+ posterior_encoder_kernel_size: 5
293
+ posterior_encoder_layers: 16
294
+ posterior_encoder_stacks: 1
295
+ posterior_encoder_base_dilation: 1
296
+ posterior_encoder_dropout_rate: 0.0
297
+ use_weight_norm_in_posterior_encoder: true
298
+ flow_flows: 4
299
+ flow_kernel_size: 5
300
+ flow_base_dilation: 1
301
+ flow_layers: 4
302
+ flow_dropout_rate: 0.0
303
+ use_weight_norm_in_flow: true
304
+ use_only_mean_in_flow: true
305
+ stochastic_duration_predictor_kernel_size: 3
306
+ stochastic_duration_predictor_dropout_rate: 0.5
307
+ stochastic_duration_predictor_flows: 4
308
+ stochastic_duration_predictor_dds_conv_layers: 3
309
+ vocabs: 85
310
+ aux_channels: 1025
311
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
312
+ discriminator_params:
313
+ scales: 1
314
+ scale_downsample_pooling: AvgPool1d
315
+ scale_downsample_pooling_params:
316
+ kernel_size: 4
317
+ stride: 2
318
+ padding: 2
319
+ scale_discriminator_params:
320
+ in_channels: 1
321
+ out_channels: 1
322
+ kernel_sizes:
323
+ - 15
324
+ - 41
325
+ - 5
326
+ - 3
327
+ channels: 128
328
+ max_downsample_channels: 1024
329
+ max_groups: 16
330
+ bias: true
331
+ downsample_scales:
332
+ - 2
333
+ - 2
334
+ - 4
335
+ - 4
336
+ - 1
337
+ nonlinear_activation: LeakyReLU
338
+ nonlinear_activation_params:
339
+ negative_slope: 0.1
340
+ use_weight_norm: true
341
+ use_spectral_norm: false
342
+ follow_official_norm: false
343
+ periods:
344
+ - 2
345
+ - 3
346
+ - 5
347
+ - 7
348
+ - 11
349
+ period_discriminator_params:
350
+ in_channels: 1
351
+ out_channels: 1
352
+ kernel_sizes:
353
+ - 5
354
+ - 3
355
+ channels: 32
356
+ downsample_scales:
357
+ - 3
358
+ - 3
359
+ - 3
360
+ - 3
361
+ - 1
362
+ max_downsample_channels: 1024
363
+ bias: true
364
+ nonlinear_activation: LeakyReLU
365
+ nonlinear_activation_params:
366
+ negative_slope: 0.1
367
+ use_weight_norm: true
368
+ use_spectral_norm: false
369
+ generator_adv_loss_params:
370
+ average_by_discriminators: false
371
+ loss_type: mse
372
+ discriminator_adv_loss_params:
373
+ average_by_discriminators: false
374
+ loss_type: mse
375
+ feat_match_loss_params:
376
+ average_by_discriminators: false
377
+ average_by_layers: false
378
+ include_final_outputs: true
379
+ mel_loss_params:
380
+ fs: 44100
381
+ n_fft: 2048
382
+ hop_length: 512
383
+ win_length: null
384
+ window: hann
385
+ n_mels: 80
386
+ fmin: 0
387
+ fmax: null
388
+ log_base: null
389
+ lambda_adv: 1.0
390
+ lambda_mel: 45.0
391
+ lambda_feat_match: 2.0
392
+ lambda_dur: 1.0
393
+ lambda_kl: 1.0
394
+ sampling_rate: 44100
395
+ cache_generator_outputs: true
396
+ pitch_extract: null
397
+ pitch_extract_conf: {}
398
+ pitch_normalize: null
399
+ pitch_normalize_conf: {}
400
+ energy_extract: null
401
+ energy_extract_conf: {}
402
+ energy_normalize: null
403
+ energy_normalize_conf: {}
404
+ required:
405
+ - output_dir
406
+ - token_list
407
+ version: '202308'
408
+ distributed: false
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_backward_time.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_fake_loss.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_forward_time.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_loss.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_optim_step_time.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_real_loss.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/discriminator_train_time.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_adv_loss.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_backward_time.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_dur_loss.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_feat_match_loss.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_forward_time.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_kl_loss.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_loss.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_mel_loss.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_optim_step_time.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/generator_train_time.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/iter_time.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/optim0_lr0.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/optim1_lr0.png ADDED
exp/tts_finetune_full_band_vits_raw_phn_jaconv_pyopenjtalk/images/train_time.png ADDED