imenLa commited on
Commit
c045d56
·
verified ·
1 Parent(s): 9f2fb99

Upload 423 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. exp/tts_stats_raw_phn_none/logdir/stats.1.log +116 -0
  2. exp/tts_stats_raw_phn_none/logdir/stats.10/config.yaml +267 -0
  3. exp/tts_stats_raw_phn_none/logdir/stats.10/train/batch_keys +2 -0
  4. exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_lengths_stats.npz +3 -0
  5. exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_stats.npz +3 -0
  6. exp/tts_stats_raw_phn_none/logdir/stats.10/train/speech_shape +43 -0
  7. exp/tts_stats_raw_phn_none/logdir/stats.10/train/stats_keys +2 -0
  8. exp/tts_stats_raw_phn_none/logdir/stats.10/train/text_shape +43 -0
  9. exp/tts_stats_raw_phn_none/logdir/stats.10/valid/batch_keys +2 -0
  10. exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_lengths_stats.npz +3 -0
  11. exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_stats.npz +3 -0
  12. exp/tts_stats_raw_phn_none/logdir/stats.10/valid/speech_shape +2 -0
  13. exp/tts_stats_raw_phn_none/logdir/stats.10/valid/stats_keys +2 -0
  14. exp/tts_stats_raw_phn_none/logdir/stats.10/valid/text_shape +2 -0
  15. exp/tts_stats_raw_phn_none/logdir/stats.11.log +116 -0
  16. exp/tts_stats_raw_phn_none/logdir/stats.12.log +116 -0
  17. exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml +267 -0
  18. exp/tts_stats_raw_phn_none/logdir/stats.12/train/batch_keys +2 -0
  19. exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_lengths_stats.npz +3 -0
  20. exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_stats.npz +3 -0
  21. exp/tts_stats_raw_phn_none/logdir/stats.12/train/speech_shape +43 -0
  22. exp/tts_stats_raw_phn_none/logdir/stats.12/train/stats_keys +2 -0
  23. exp/tts_stats_raw_phn_none/logdir/stats.12/train/text_shape +43 -0
  24. exp/tts_stats_raw_phn_none/logdir/stats.12/valid/batch_keys +2 -0
  25. exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_lengths_stats.npz +3 -0
  26. exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_stats.npz +3 -0
  27. exp/tts_stats_raw_phn_none/logdir/stats.12/valid/speech_shape +2 -0
  28. exp/tts_stats_raw_phn_none/logdir/stats.12/valid/stats_keys +2 -0
  29. exp/tts_stats_raw_phn_none/logdir/stats.12/valid/text_shape +2 -0
  30. exp/tts_stats_raw_phn_none/logdir/stats.13.log +116 -0
  31. exp/tts_stats_raw_phn_none/logdir/stats.14.log +116 -0
  32. exp/tts_stats_raw_phn_none/logdir/stats.15.log +116 -0
  33. exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml +267 -0
  34. exp/tts_stats_raw_phn_none/logdir/stats.15/train/batch_keys +2 -0
  35. exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_lengths_stats.npz +3 -0
  36. exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_stats.npz +3 -0
  37. exp/tts_stats_raw_phn_none/logdir/stats.15/train/speech_shape +43 -0
  38. exp/tts_stats_raw_phn_none/logdir/stats.15/train/stats_keys +2 -0
  39. exp/tts_stats_raw_phn_none/logdir/stats.15/train/text_shape +43 -0
  40. exp/tts_stats_raw_phn_none/logdir/stats.15/valid/batch_keys +2 -0
  41. exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_lengths_stats.npz +3 -0
  42. exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_stats.npz +3 -0
  43. exp/tts_stats_raw_phn_none/logdir/stats.15/valid/speech_shape +2 -0
  44. exp/tts_stats_raw_phn_none/logdir/stats.15/valid/stats_keys +2 -0
  45. exp/tts_stats_raw_phn_none/logdir/stats.15/valid/text_shape +2 -0
  46. exp/tts_stats_raw_phn_none/logdir/stats.17.log +116 -0
  47. exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml +267 -0
  48. exp/tts_stats_raw_phn_none/logdir/stats.17/train/batch_keys +2 -0
  49. exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_lengths_stats.npz +3 -0
  50. exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.1.log ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.1 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
2
+ # Started at Thu Jul 13 14:09:11 UTC 2023
3
+ #
4
+ /opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
5
+ warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
6
+ /opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.1 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
7
+ [7850374a3496] 2023-07-13 14:09:21,971 (tts:293) INFO: Vocabulary size: 79
8
+ [7850374a3496] 2023-07-13 14:09:22,770 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
9
+ [7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1204) INFO: Model structure:
10
+ ESPnetTTSModel(
11
+ (feats_extract): LogMelFbank(
12
+ (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
13
+ (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
14
+ )
15
+ (tts): Tacotron2(
16
+ (enc): Encoder(
17
+ (embed): Embedding(79, 512, padding_idx=0)
18
+ (convs): ModuleList(
19
+ (0-2): 3 x Sequential(
20
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
21
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
22
+ (2): ReLU()
23
+ (3): Dropout(p=0.5, inplace=False)
24
+ )
25
+ )
26
+ (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
27
+ )
28
+ (dec): Decoder(
29
+ (att): AttLoc(
30
+ (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
31
+ (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
32
+ (mlp_att): Linear(in_features=32, out_features=512, bias=False)
33
+ (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
34
+ (gvec): Linear(in_features=512, out_features=1, bias=True)
35
+ )
36
+ (lstm): ModuleList(
37
+ (0): ZoneOutCell(
38
+ (cell): LSTMCell(768, 1024)
39
+ )
40
+ (1): ZoneOutCell(
41
+ (cell): LSTMCell(1024, 1024)
42
+ )
43
+ )
44
+ (prenet): Prenet(
45
+ (prenet): ModuleList(
46
+ (0): Sequential(
47
+ (0): Linear(in_features=80, out_features=256, bias=True)
48
+ (1): ReLU()
49
+ )
50
+ (1): Sequential(
51
+ (0): Linear(in_features=256, out_features=256, bias=True)
52
+ (1): ReLU()
53
+ )
54
+ )
55
+ )
56
+ (postnet): Postnet(
57
+ (postnet): ModuleList(
58
+ (0): Sequential(
59
+ (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
60
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
61
+ (2): Tanh()
62
+ (3): Dropout(p=0.5, inplace=False)
63
+ )
64
+ (1-3): 3 x Sequential(
65
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
66
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
67
+ (2): Tanh()
68
+ (3): Dropout(p=0.5, inplace=False)
69
+ )
70
+ (4): Sequential(
71
+ (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
72
+ (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
73
+ (2): Dropout(p=0.5, inplace=False)
74
+ )
75
+ )
76
+ )
77
+ (feat_out): Linear(in_features=1536, out_features=240, bias=False)
78
+ (prob_out): Linear(in_features=1536, out_features=3, bias=True)
79
+ )
80
+ (taco2_loss): Tacotron2Loss(
81
+ (l1_criterion): L1Loss()
82
+ (mse_criterion): MSELoss()
83
+ (bce_criterion): BCEWithLogitsLoss()
84
+ )
85
+ (attn_loss): GuidedAttentionLoss()
86
+ )
87
+ )
88
+
89
+ Model summary:
90
+ Class Name: ESPnetTTSModel
91
+ Total Number of model parameters: 26.91 M
92
+ Number of trainable parameters: 26.91 M (100.0%)
93
+ Size: 107.63 MB
94
+ Type: torch.float32
95
+ [7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1207) INFO: Optimizer:
96
+ Adam (
97
+ Parameter Group 0
98
+ amsgrad: False
99
+ betas: (0.9, 0.999)
100
+ capturable: False
101
+ differentiable: False
102
+ eps: 1e-06
103
+ foreach: None
104
+ fused: None
105
+ lr: 0.001
106
+ maximize: False
107
+ weight_decay: 0.0
108
+ )
109
+ [7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1208) INFO: Scheduler: None
110
+ [7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.1/config.yaml
111
+ [7850374a3496] 2023-07-13 14:09:22,799 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.1', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.1.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.1.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
112
+ /opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
113
+ Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
114
+ return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
115
+ # Accounting: time=16 threads=1
116
+ # Ended (code 0) at Thu Jul 13 14:09:27 UTC 2023, elapsed time 16 seconds
exp/tts_stats_raw_phn_none/logdir/stats.10/config.yaml ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/finetune_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_stats_raw_phn_none/logdir/stats.10
7
+ ngpu: 0
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: null
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: true
26
+ write_collected_feats: false
27
+ max_epoch: 120
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 1
49
+ no_forward_run: false
50
+ resume: false
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ create_graph_in_tensorboard: false
57
+ use_wandb: false
58
+ wandb_project: null
59
+ wandb_id: null
60
+ wandb_entity: null
61
+ wandb_name: null
62
+ wandb_model_log_interval: -1
63
+ detect_anomaly: false
64
+ pretrain_path: null
65
+ init_param: []
66
+ ignore_init_mismatch: false
67
+ freeze_param: []
68
+ num_iters_per_epoch: 200
69
+ batch_size: 20
70
+ valid_batch_size: null
71
+ batch_bins: 1600000
72
+ valid_batch_bins: null
73
+ train_shape_file:
74
+ - exp/tts_stats_raw_phn_none/logdir/train.10.scp
75
+ valid_shape_file:
76
+ - exp/tts_stats_raw_phn_none/logdir/valid.10.scp
77
+ batch_type: numel
78
+ valid_batch_type: null
79
+ fold_length: []
80
+ sort_in_batch: descending
81
+ sort_batch: descending
82
+ multiple_iterator: false
83
+ chunk_length: 500
84
+ chunk_shift_ratio: 0.5
85
+ num_cache_chunks: 1024
86
+ chunk_excluded_key_prefixes: []
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/train/text
89
+ - text
90
+ - text
91
+ - - dump/raw/train/wav.scp
92
+ - speech
93
+ - sound
94
+ valid_data_path_and_name_and_type:
95
+ - - dump/raw/dev/text
96
+ - text
97
+ - text
98
+ - - dump/raw/dev/wav.scp
99
+ - speech
100
+ - sound
101
+ allow_variable_data_keys: false
102
+ max_cache_size: 0.0
103
+ max_cache_fd: 32
104
+ valid_max_cache_size: null
105
+ exclude_weight_decay: false
106
+ exclude_weight_decay_conf: {}
107
+ optim: adam
108
+ optim_conf:
109
+ lr: 0.001
110
+ eps: 1.0e-06
111
+ weight_decay: 0.0
112
+ scheduler: null
113
+ scheduler_conf: {}
114
+ token_list:
115
+ - <blank>
116
+ - <unk>
117
+ - a
118
+ - sil
119
+ - l
120
+ - aa
121
+ - m
122
+ - ii0
123
+ - t
124
+ - <
125
+ - n
126
+ - r
127
+ - E
128
+ - i0
129
+ - b
130
+ - uu0
131
+ - f
132
+ - i1
133
+ - k
134
+ - w
135
+ - A
136
+ - s
137
+ - y
138
+ - d
139
+ - q
140
+ - h
141
+ - H
142
+ - $
143
+ - u0
144
+ - AA
145
+ - j
146
+ - T
147
+ - x
148
+ - S
149
+ - z
150
+ - ll
151
+ - I1
152
+ - D
153
+ - II0
154
+ - g
155
+ - tt
156
+ - rr
157
+ - I0
158
+ - UU0
159
+ - dd
160
+ - u1
161
+ - U0
162
+ - mm
163
+ - nn
164
+ - '*'
165
+ - $$
166
+ - bb
167
+ - yy
168
+ - ss
169
+ - jj
170
+ - ww
171
+ - ^
172
+ - SS
173
+ - TT
174
+ - Z
175
+ - zz
176
+ - kk
177
+ - U1
178
+ - HH
179
+ - ff
180
+ - qq
181
+ - xx
182
+ - ^^
183
+ - DD
184
+ - hh
185
+ - EE
186
+ - ZZ
187
+ - '**'
188
+ - aaaa
189
+ - ssss
190
+ - v
191
+ - uu1
192
+ - jjjj
193
+ - <sos/eos>
194
+ odim: null
195
+ model_conf: {}
196
+ use_preprocessor: true
197
+ token_type: phn
198
+ bpemodel: null
199
+ non_linguistic_symbols: null
200
+ cleaner: null
201
+ g2p: null
202
+ feats_extract: fbank
203
+ feats_extract_conf:
204
+ n_fft: 1024
205
+ hop_length: 256
206
+ win_length: null
207
+ fs: 22050
208
+ fmin: 80
209
+ fmax: 7600
210
+ n_mels: 80
211
+ normalize: null
212
+ normalize_conf: {}
213
+ tts: tacotron2
214
+ tts_conf:
215
+ embed_dim: 512
216
+ elayers: 1
217
+ eunits: 512
218
+ econv_layers: 3
219
+ econv_chans: 512
220
+ econv_filts: 5
221
+ atype: location
222
+ adim: 512
223
+ aconv_chans: 32
224
+ aconv_filts: 15
225
+ cumulate_att_w: true
226
+ dlayers: 2
227
+ dunits: 1024
228
+ prenet_layers: 2
229
+ prenet_units: 256
230
+ postnet_layers: 5
231
+ postnet_chans: 512
232
+ postnet_filts: 5
233
+ output_activation: null
234
+ use_batch_norm: true
235
+ use_concate: true
236
+ use_residual: false
237
+ dropout_rate: 0.5
238
+ zoneout_rate: 0.1
239
+ reduction_factor: 3
240
+ spk_embed_dim: null
241
+ use_masking: true
242
+ bce_pos_weight: 20.0
243
+ use_guided_attn_loss: true
244
+ guided_attn_loss_sigma: 0.4
245
+ guided_attn_loss_lambda: 1.0
246
+ pitch_extract: null
247
+ pitch_extract_conf:
248
+ fs: 22050
249
+ n_fft: 1024
250
+ hop_length: 256
251
+ f0max: 400
252
+ f0min: 80
253
+ pitch_normalize: null
254
+ pitch_normalize_conf: {}
255
+ energy_extract: null
256
+ energy_extract_conf:
257
+ fs: 22050
258
+ n_fft: 1024
259
+ hop_length: 256
260
+ win_length: null
261
+ energy_normalize: null
262
+ energy_normalize_conf: {}
263
+ required:
264
+ - output_dir
265
+ - token_list
266
+ version: '202304'
267
+ distributed: false
exp/tts_stats_raw_phn_none/logdir/stats.10/train/batch_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ text
2
+ speech
exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_lengths_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d6b9f2fd6232f4b0ca33457b5d22c02d2b17b34d24e2f9f1f2415b0ec8a15f0
3
+ size 778
exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:057d10f13786abd5b7b6b90bea854b18ad227d34f19bb8092c488f864880dd51
3
+ size 1402
exp/tts_stats_raw_phn_none/logdir/stats.10/train/speech_shape ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 18935 142336
2
+ 18936 141568
3
+ 18943 175360
4
+ 18944 173824
5
+ 18947 190208
6
+ 18951 154368
7
+ 18955 233216
8
+ 18959 226560
9
+ 18964 163584
10
+ 18982 113664
11
+ 18989 163072
12
+ 18991 212480
13
+ 18993 175872
14
+ 18997 101888
15
+ 19 122880
16
+ 19001 217088
17
+ 19005 184832
18
+ 19010 156928
19
+ 19011 175872
20
+ 19015 139520
21
+ 19024 165888
22
+ 19028 158720
23
+ 19063 187136
24
+ 19065 144128
25
+ 19067 175616
26
+ 19075 163584
27
+ 19076 214784
28
+ 19090 172544
29
+ 19091 199936
30
+ 19095 118016
31
+ 19096 165888
32
+ 19099 159488
33
+ 191 134144
34
+ 19103 124416
35
+ 19109 132352
36
+ 19111 151740
37
+ 19113 129280
38
+ 19116 155648
39
+ 19118 174336
40
+ 19121 137472
41
+ 19122 144896
42
+ 19132 131072
43
+ 19138 135936
exp/tts_stats_raw_phn_none/logdir/stats.10/train/stats_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ feats
2
+ feats_lengths
exp/tts_stats_raw_phn_none/logdir/stats.10/train/text_shape ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 18935 66
2
+ 18936 77
3
+ 18943 94
4
+ 18944 90
5
+ 18947 93
6
+ 18951 66
7
+ 18955 116
8
+ 18959 120
9
+ 18964 81
10
+ 18982 54
11
+ 18989 85
12
+ 18991 114
13
+ 18993 100
14
+ 18997 45
15
+ 19 58
16
+ 19001 132
17
+ 19005 97
18
+ 19010 82
19
+ 19011 97
20
+ 19015 72
21
+ 19024 90
22
+ 19028 71
23
+ 19063 115
24
+ 19065 84
25
+ 19067 83
26
+ 19075 78
27
+ 19076 112
28
+ 19090 92
29
+ 19091 108
30
+ 19095 62
31
+ 19096 89
32
+ 19099 87
33
+ 191 70
34
+ 19103 68
35
+ 19109 75
36
+ 19111 80
37
+ 19113 45
38
+ 19116 87
39
+ 19118 97
40
+ 19121 74
41
+ 19122 87
42
+ 19132 69
43
+ 19138 75
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/batch_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ text
2
+ speech
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_lengths_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fe87f28f6100dafb92cda513225e57bd983e4483dbefd895ad65790398958c0
3
+ size 778
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c23bf05ba35b7d316b51347290281e31e36aca870887098c995fd8f5c860508
3
+ size 1402
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/speech_shape ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 169 189952
2
+ 18237 234496
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/stats_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ feats
2
+ feats_lengths
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/text_shape ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 169 104
2
+ 18237 134
exp/tts_stats_raw_phn_none/logdir/stats.11.log ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.11.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.11.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.11 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
2
+ # Started at Thu Jul 13 14:10:19 UTC 2023
3
+ #
4
+ /opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
5
+ warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
6
+ /opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.11.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.11.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.11 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
7
+ [7850374a3496] 2023-07-13 14:10:27,026 (tts:293) INFO: Vocabulary size: 79
8
+ [7850374a3496] 2023-07-13 14:10:27,731 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
9
+ [7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1204) INFO: Model structure:
10
+ ESPnetTTSModel(
11
+ (feats_extract): LogMelFbank(
12
+ (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
13
+ (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
14
+ )
15
+ (tts): Tacotron2(
16
+ (enc): Encoder(
17
+ (embed): Embedding(79, 512, padding_idx=0)
18
+ (convs): ModuleList(
19
+ (0-2): 3 x Sequential(
20
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
21
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
22
+ (2): ReLU()
23
+ (3): Dropout(p=0.5, inplace=False)
24
+ )
25
+ )
26
+ (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
27
+ )
28
+ (dec): Decoder(
29
+ (att): AttLoc(
30
+ (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
31
+ (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
32
+ (mlp_att): Linear(in_features=32, out_features=512, bias=False)
33
+ (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
34
+ (gvec): Linear(in_features=512, out_features=1, bias=True)
35
+ )
36
+ (lstm): ModuleList(
37
+ (0): ZoneOutCell(
38
+ (cell): LSTMCell(768, 1024)
39
+ )
40
+ (1): ZoneOutCell(
41
+ (cell): LSTMCell(1024, 1024)
42
+ )
43
+ )
44
+ (prenet): Prenet(
45
+ (prenet): ModuleList(
46
+ (0): Sequential(
47
+ (0): Linear(in_features=80, out_features=256, bias=True)
48
+ (1): ReLU()
49
+ )
50
+ (1): Sequential(
51
+ (0): Linear(in_features=256, out_features=256, bias=True)
52
+ (1): ReLU()
53
+ )
54
+ )
55
+ )
56
+ (postnet): Postnet(
57
+ (postnet): ModuleList(
58
+ (0): Sequential(
59
+ (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
60
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
61
+ (2): Tanh()
62
+ (3): Dropout(p=0.5, inplace=False)
63
+ )
64
+ (1-3): 3 x Sequential(
65
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
66
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
67
+ (2): Tanh()
68
+ (3): Dropout(p=0.5, inplace=False)
69
+ )
70
+ (4): Sequential(
71
+ (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
72
+ (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
73
+ (2): Dropout(p=0.5, inplace=False)
74
+ )
75
+ )
76
+ )
77
+ (feat_out): Linear(in_features=1536, out_features=240, bias=False)
78
+ (prob_out): Linear(in_features=1536, out_features=3, bias=True)
79
+ )
80
+ (taco2_loss): Tacotron2Loss(
81
+ (l1_criterion): L1Loss()
82
+ (mse_criterion): MSELoss()
83
+ (bce_criterion): BCEWithLogitsLoss()
84
+ )
85
+ (attn_loss): GuidedAttentionLoss()
86
+ )
87
+ )
88
+
89
+ Model summary:
90
+ Class Name: ESPnetTTSModel
91
+ Total Number of model parameters: 26.91 M
92
+ Number of trainable parameters: 26.91 M (100.0%)
93
+ Size: 107.63 MB
94
+ Type: torch.float32
95
+ [7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1207) INFO: Optimizer:
96
+ Adam (
97
+ Parameter Group 0
98
+ amsgrad: False
99
+ betas: (0.9, 0.999)
100
+ capturable: False
101
+ differentiable: False
102
+ eps: 1e-06
103
+ foreach: None
104
+ fused: None
105
+ lr: 0.001
106
+ maximize: False
107
+ weight_decay: 0.0
108
+ )
109
+ [7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1208) INFO: Scheduler: None
110
+ [7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.11/config.yaml
111
+ [7850374a3496] 2023-07-13 14:10:27,761 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.11', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.11.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.11.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
112
+ /opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
113
+ Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
114
+ return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
115
+ # Accounting: time=12 threads=1
116
+ # Ended (code 0) at Thu Jul 13 14:10:31 UTC 2023, elapsed time 12 seconds
exp/tts_stats_raw_phn_none/logdir/stats.12.log ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.12.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.12.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.12 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
2
+ # Started at Thu Jul 13 14:10:19 UTC 2023
3
+ #
4
+ /opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
5
+ warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
6
+ /opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.12.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.12.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.12 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
7
+ [7850374a3496] 2023-07-13 14:10:27,287 (tts:293) INFO: Vocabulary size: 79
8
+ [7850374a3496] 2023-07-13 14:10:27,998 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
9
+ [7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1204) INFO: Model structure:
10
+ ESPnetTTSModel(
11
+ (feats_extract): LogMelFbank(
12
+ (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
13
+ (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
14
+ )
15
+ (tts): Tacotron2(
16
+ (enc): Encoder(
17
+ (embed): Embedding(79, 512, padding_idx=0)
18
+ (convs): ModuleList(
19
+ (0-2): 3 x Sequential(
20
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
21
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
22
+ (2): ReLU()
23
+ (3): Dropout(p=0.5, inplace=False)
24
+ )
25
+ )
26
+ (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
27
+ )
28
+ (dec): Decoder(
29
+ (att): AttLoc(
30
+ (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
31
+ (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
32
+ (mlp_att): Linear(in_features=32, out_features=512, bias=False)
33
+ (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
34
+ (gvec): Linear(in_features=512, out_features=1, bias=True)
35
+ )
36
+ (lstm): ModuleList(
37
+ (0): ZoneOutCell(
38
+ (cell): LSTMCell(768, 1024)
39
+ )
40
+ (1): ZoneOutCell(
41
+ (cell): LSTMCell(1024, 1024)
42
+ )
43
+ )
44
+ (prenet): Prenet(
45
+ (prenet): ModuleList(
46
+ (0): Sequential(
47
+ (0): Linear(in_features=80, out_features=256, bias=True)
48
+ (1): ReLU()
49
+ )
50
+ (1): Sequential(
51
+ (0): Linear(in_features=256, out_features=256, bias=True)
52
+ (1): ReLU()
53
+ )
54
+ )
55
+ )
56
+ (postnet): Postnet(
57
+ (postnet): ModuleList(
58
+ (0): Sequential(
59
+ (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
60
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
61
+ (2): Tanh()
62
+ (3): Dropout(p=0.5, inplace=False)
63
+ )
64
+ (1-3): 3 x Sequential(
65
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
66
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
67
+ (2): Tanh()
68
+ (3): Dropout(p=0.5, inplace=False)
69
+ )
70
+ (4): Sequential(
71
+ (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
72
+ (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
73
+ (2): Dropout(p=0.5, inplace=False)
74
+ )
75
+ )
76
+ )
77
+ (feat_out): Linear(in_features=1536, out_features=240, bias=False)
78
+ (prob_out): Linear(in_features=1536, out_features=3, bias=True)
79
+ )
80
+ (taco2_loss): Tacotron2Loss(
81
+ (l1_criterion): L1Loss()
82
+ (mse_criterion): MSELoss()
83
+ (bce_criterion): BCEWithLogitsLoss()
84
+ )
85
+ (attn_loss): GuidedAttentionLoss()
86
+ )
87
+ )
88
+
89
+ Model summary:
90
+ Class Name: ESPnetTTSModel
91
+ Total Number of model parameters: 26.91 M
92
+ Number of trainable parameters: 26.91 M (100.0%)
93
+ Size: 107.63 MB
94
+ Type: torch.float32
95
+ [7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1207) INFO: Optimizer:
96
+ Adam (
97
+ Parameter Group 0
98
+ amsgrad: False
99
+ betas: (0.9, 0.999)
100
+ capturable: False
101
+ differentiable: False
102
+ eps: 1e-06
103
+ foreach: None
104
+ fused: None
105
+ lr: 0.001
106
+ maximize: False
107
+ weight_decay: 0.0
108
+ )
109
+ [7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1208) INFO: Scheduler: None
110
+ [7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml
111
+ [7850374a3496] 2023-07-13 14:10:28,024 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.12', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.12.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.12.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
112
+ /opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
113
+ Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
114
+ return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
115
+ # Accounting: time=13 threads=1
116
+ # Ended (code 0) at Thu Jul 13 14:10:32 UTC 2023, elapsed time 13 seconds
exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/finetune_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_stats_raw_phn_none/logdir/stats.12
7
+ ngpu: 0
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: null
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: true
26
+ write_collected_feats: false
27
+ max_epoch: 120
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 1
49
+ no_forward_run: false
50
+ resume: false
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ create_graph_in_tensorboard: false
57
+ use_wandb: false
58
+ wandb_project: null
59
+ wandb_id: null
60
+ wandb_entity: null
61
+ wandb_name: null
62
+ wandb_model_log_interval: -1
63
+ detect_anomaly: false
64
+ pretrain_path: null
65
+ init_param: []
66
+ ignore_init_mismatch: false
67
+ freeze_param: []
68
+ num_iters_per_epoch: 200
69
+ batch_size: 20
70
+ valid_batch_size: null
71
+ batch_bins: 1600000
72
+ valid_batch_bins: null
73
+ train_shape_file:
74
+ - exp/tts_stats_raw_phn_none/logdir/train.12.scp
75
+ valid_shape_file:
76
+ - exp/tts_stats_raw_phn_none/logdir/valid.12.scp
77
+ batch_type: numel
78
+ valid_batch_type: null
79
+ fold_length: []
80
+ sort_in_batch: descending
81
+ sort_batch: descending
82
+ multiple_iterator: false
83
+ chunk_length: 500
84
+ chunk_shift_ratio: 0.5
85
+ num_cache_chunks: 1024
86
+ chunk_excluded_key_prefixes: []
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/train/text
89
+ - text
90
+ - text
91
+ - - dump/raw/train/wav.scp
92
+ - speech
93
+ - sound
94
+ valid_data_path_and_name_and_type:
95
+ - - dump/raw/dev/text
96
+ - text
97
+ - text
98
+ - - dump/raw/dev/wav.scp
99
+ - speech
100
+ - sound
101
+ allow_variable_data_keys: false
102
+ max_cache_size: 0.0
103
+ max_cache_fd: 32
104
+ valid_max_cache_size: null
105
+ exclude_weight_decay: false
106
+ exclude_weight_decay_conf: {}
107
+ optim: adam
108
+ optim_conf:
109
+ lr: 0.001
110
+ eps: 1.0e-06
111
+ weight_decay: 0.0
112
+ scheduler: null
113
+ scheduler_conf: {}
114
+ token_list:
115
+ - <blank>
116
+ - <unk>
117
+ - a
118
+ - sil
119
+ - l
120
+ - aa
121
+ - m
122
+ - ii0
123
+ - t
124
+ - <
125
+ - n
126
+ - r
127
+ - E
128
+ - i0
129
+ - b
130
+ - uu0
131
+ - f
132
+ - i1
133
+ - k
134
+ - w
135
+ - A
136
+ - s
137
+ - y
138
+ - d
139
+ - q
140
+ - h
141
+ - H
142
+ - $
143
+ - u0
144
+ - AA
145
+ - j
146
+ - T
147
+ - x
148
+ - S
149
+ - z
150
+ - ll
151
+ - I1
152
+ - D
153
+ - II0
154
+ - g
155
+ - tt
156
+ - rr
157
+ - I0
158
+ - UU0
159
+ - dd
160
+ - u1
161
+ - U0
162
+ - mm
163
+ - nn
164
+ - '*'
165
+ - $$
166
+ - bb
167
+ - yy
168
+ - ss
169
+ - jj
170
+ - ww
171
+ - ^
172
+ - SS
173
+ - TT
174
+ - Z
175
+ - zz
176
+ - kk
177
+ - U1
178
+ - HH
179
+ - ff
180
+ - qq
181
+ - xx
182
+ - ^^
183
+ - DD
184
+ - hh
185
+ - EE
186
+ - ZZ
187
+ - '**'
188
+ - aaaa
189
+ - ssss
190
+ - v
191
+ - uu1
192
+ - jjjj
193
+ - <sos/eos>
194
+ odim: null
195
+ model_conf: {}
196
+ use_preprocessor: true
197
+ token_type: phn
198
+ bpemodel: null
199
+ non_linguistic_symbols: null
200
+ cleaner: null
201
+ g2p: null
202
+ feats_extract: fbank
203
+ feats_extract_conf:
204
+ n_fft: 1024
205
+ hop_length: 256
206
+ win_length: null
207
+ fs: 22050
208
+ fmin: 80
209
+ fmax: 7600
210
+ n_mels: 80
211
+ normalize: null
212
+ normalize_conf: {}
213
+ tts: tacotron2
214
+ tts_conf:
215
+ embed_dim: 512
216
+ elayers: 1
217
+ eunits: 512
218
+ econv_layers: 3
219
+ econv_chans: 512
220
+ econv_filts: 5
221
+ atype: location
222
+ adim: 512
223
+ aconv_chans: 32
224
+ aconv_filts: 15
225
+ cumulate_att_w: true
226
+ dlayers: 2
227
+ dunits: 1024
228
+ prenet_layers: 2
229
+ prenet_units: 256
230
+ postnet_layers: 5
231
+ postnet_chans: 512
232
+ postnet_filts: 5
233
+ output_activation: null
234
+ use_batch_norm: true
235
+ use_concate: true
236
+ use_residual: false
237
+ dropout_rate: 0.5
238
+ zoneout_rate: 0.1
239
+ reduction_factor: 3
240
+ spk_embed_dim: null
241
+ use_masking: true
242
+ bce_pos_weight: 20.0
243
+ use_guided_attn_loss: true
244
+ guided_attn_loss_sigma: 0.4
245
+ guided_attn_loss_lambda: 1.0
246
+ pitch_extract: null
247
+ pitch_extract_conf:
248
+ fs: 22050
249
+ n_fft: 1024
250
+ hop_length: 256
251
+ f0max: 400
252
+ f0min: 80
253
+ pitch_normalize: null
254
+ pitch_normalize_conf: {}
255
+ energy_extract: null
256
+ energy_extract_conf:
257
+ fs: 22050
258
+ n_fft: 1024
259
+ hop_length: 256
260
+ win_length: null
261
+ energy_normalize: null
262
+ energy_normalize_conf: {}
263
+ required:
264
+ - output_dir
265
+ - token_list
266
+ version: '202304'
267
+ distributed: false
exp/tts_stats_raw_phn_none/logdir/stats.12/train/batch_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ text
2
+ speech
exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_lengths_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d49bfea4033ce1e51b1a17d023326b9c8fc5b58658ad92a4ab13fae6f7b8d624
3
+ size 778
exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75c6bcb5409152fe06dfbb367a0d796774c6f1e94af5fa448137f8a901f9c284
3
+ size 1402
exp/tts_stats_raw_phn_none/logdir/stats.12/train/speech_shape ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 19360 171776
2
+ 19366 143104
3
+ 19367 199936
4
+ 19371 145920
5
+ 19372 162816
6
+ 19374 145664
7
+ 19376 201682
8
+ 19387 219904
9
+ 19396 130048
10
+ 19399 112896
11
+ 194 140032
12
+ 19400 183808
13
+ 19404 159488
14
+ 19406 186624
15
+ 19410 183552
16
+ 19413 121088
17
+ 19414 134912
18
+ 19423 198400
19
+ 19429 195328
20
+ 19439 114944
21
+ 19440 97280
22
+ 19449 159488
23
+ 19451 140032
24
+ 19454 120320
25
+ 19477 191488
26
+ 19482 157696
27
+ 19488 169472
28
+ 19496 129792
29
+ 19499 153344
30
+ 195 122624
31
+ 19501 137216
32
+ 19506 162816
33
+ 19509 143872
34
+ 19510 119040
35
+ 19511 146688
36
+ 19521 132864
37
+ 19522 167680
38
+ 19524 146944
39
+ 19529 188928
40
+ 19540 193536
41
+ 19542 179456
42
+ 19543 159669
43
+ 19548 138752
exp/tts_stats_raw_phn_none/logdir/stats.12/train/stats_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ feats
2
+ feats_lengths
exp/tts_stats_raw_phn_none/logdir/stats.12/train/text_shape ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 19360 89
2
+ 19366 85
3
+ 19367 92
4
+ 19371 64
5
+ 19372 84
6
+ 19374 75
7
+ 19376 103
8
+ 19387 108
9
+ 19396 57
10
+ 19399 54
11
+ 194 62
12
+ 19400 92
13
+ 19404 99
14
+ 19406 103
15
+ 19410 102
16
+ 19413 67
17
+ 19414 60
18
+ 19423 90
19
+ 19429 90
20
+ 19439 51
21
+ 19440 33
22
+ 19449 92
23
+ 19451 67
24
+ 19454 61
25
+ 19477 97
26
+ 19482 84
27
+ 19488 93
28
+ 19496 63
29
+ 19499 65
30
+ 195 68
31
+ 19501 60
32
+ 19506 96
33
+ 19509 80
34
+ 19510 57
35
+ 19511 77
36
+ 19521 61
37
+ 19522 80
38
+ 19524 67
39
+ 19529 91
40
+ 19540 101
41
+ 19542 107
42
+ 19543 78
43
+ 19548 61
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/batch_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ text
2
+ speech
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_lengths_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f504865400934158e457d1520e847de7701d6bd8479c772a7d9710d35616c234
3
+ size 778
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51eb4c28274a7e81976f604f330f3c2f10cc6cf6a8befcf261e3e49cbdd44ab0
3
+ size 1402
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/speech_shape ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 18963 129280
2
+ 19178 177408
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/stats_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ feats
2
+ feats_lengths
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/text_shape ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 18963 58
2
+ 19178 91
exp/tts_stats_raw_phn_none/logdir/stats.13.log ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.13.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.13.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.13 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
2
+ # Started at Thu Jul 13 14:10:31 UTC 2023
3
+ #
4
+ /opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
5
+ warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
6
+ /opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.13.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.13.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.13 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
7
+ [7850374a3496] 2023-07-13 14:10:40,144 (tts:293) INFO: Vocabulary size: 79
8
+ [7850374a3496] 2023-07-13 14:10:40,861 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
9
+ [7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1204) INFO: Model structure:
10
+ ESPnetTTSModel(
11
+ (feats_extract): LogMelFbank(
12
+ (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
13
+ (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
14
+ )
15
+ (tts): Tacotron2(
16
+ (enc): Encoder(
17
+ (embed): Embedding(79, 512, padding_idx=0)
18
+ (convs): ModuleList(
19
+ (0-2): 3 x Sequential(
20
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
21
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
22
+ (2): ReLU()
23
+ (3): Dropout(p=0.5, inplace=False)
24
+ )
25
+ )
26
+ (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
27
+ )
28
+ (dec): Decoder(
29
+ (att): AttLoc(
30
+ (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
31
+ (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
32
+ (mlp_att): Linear(in_features=32, out_features=512, bias=False)
33
+ (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
34
+ (gvec): Linear(in_features=512, out_features=1, bias=True)
35
+ )
36
+ (lstm): ModuleList(
37
+ (0): ZoneOutCell(
38
+ (cell): LSTMCell(768, 1024)
39
+ )
40
+ (1): ZoneOutCell(
41
+ (cell): LSTMCell(1024, 1024)
42
+ )
43
+ )
44
+ (prenet): Prenet(
45
+ (prenet): ModuleList(
46
+ (0): Sequential(
47
+ (0): Linear(in_features=80, out_features=256, bias=True)
48
+ (1): ReLU()
49
+ )
50
+ (1): Sequential(
51
+ (0): Linear(in_features=256, out_features=256, bias=True)
52
+ (1): ReLU()
53
+ )
54
+ )
55
+ )
56
+ (postnet): Postnet(
57
+ (postnet): ModuleList(
58
+ (0): Sequential(
59
+ (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
60
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
61
+ (2): Tanh()
62
+ (3): Dropout(p=0.5, inplace=False)
63
+ )
64
+ (1-3): 3 x Sequential(
65
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
66
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
67
+ (2): Tanh()
68
+ (3): Dropout(p=0.5, inplace=False)
69
+ )
70
+ (4): Sequential(
71
+ (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
72
+ (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
73
+ (2): Dropout(p=0.5, inplace=False)
74
+ )
75
+ )
76
+ )
77
+ (feat_out): Linear(in_features=1536, out_features=240, bias=False)
78
+ (prob_out): Linear(in_features=1536, out_features=3, bias=True)
79
+ )
80
+ (taco2_loss): Tacotron2Loss(
81
+ (l1_criterion): L1Loss()
82
+ (mse_criterion): MSELoss()
83
+ (bce_criterion): BCEWithLogitsLoss()
84
+ )
85
+ (attn_loss): GuidedAttentionLoss()
86
+ )
87
+ )
88
+
89
+ Model summary:
90
+ Class Name: ESPnetTTSModel
91
+ Total Number of model parameters: 26.91 M
92
+ Number of trainable parameters: 26.91 M (100.0%)
93
+ Size: 107.63 MB
94
+ Type: torch.float32
95
+ [7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1207) INFO: Optimizer:
96
+ Adam (
97
+ Parameter Group 0
98
+ amsgrad: False
99
+ betas: (0.9, 0.999)
100
+ capturable: False
101
+ differentiable: False
102
+ eps: 1e-06
103
+ foreach: None
104
+ fused: None
105
+ lr: 0.001
106
+ maximize: False
107
+ weight_decay: 0.0
108
+ )
109
+ [7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1208) INFO: Scheduler: None
110
+ [7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.13/config.yaml
111
+ [7850374a3496] 2023-07-13 14:10:40,891 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.13', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.13.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.13.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
112
+ /opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
113
+ Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
114
+ return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
115
+ # Accounting: time=13 threads=1
116
+ # Ended (code 0) at Thu Jul 13 14:10:44 UTC 2023, elapsed time 13 seconds
exp/tts_stats_raw_phn_none/logdir/stats.14.log ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.14.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.14.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.14 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
2
+ # Started at Thu Jul 13 14:10:32 UTC 2023
3
+ #
4
+ /opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
5
+ warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
6
+ /opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.14.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.14.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.14 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
7
+ [7850374a3496] 2023-07-13 14:10:40,319 (tts:293) INFO: Vocabulary size: 79
8
+ [7850374a3496] 2023-07-13 14:10:41,034 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
9
+ [7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1204) INFO: Model structure:
10
+ ESPnetTTSModel(
11
+ (feats_extract): LogMelFbank(
12
+ (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
13
+ (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
14
+ )
15
+ (tts): Tacotron2(
16
+ (enc): Encoder(
17
+ (embed): Embedding(79, 512, padding_idx=0)
18
+ (convs): ModuleList(
19
+ (0-2): 3 x Sequential(
20
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
21
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
22
+ (2): ReLU()
23
+ (3): Dropout(p=0.5, inplace=False)
24
+ )
25
+ )
26
+ (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
27
+ )
28
+ (dec): Decoder(
29
+ (att): AttLoc(
30
+ (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
31
+ (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
32
+ (mlp_att): Linear(in_features=32, out_features=512, bias=False)
33
+ (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
34
+ (gvec): Linear(in_features=512, out_features=1, bias=True)
35
+ )
36
+ (lstm): ModuleList(
37
+ (0): ZoneOutCell(
38
+ (cell): LSTMCell(768, 1024)
39
+ )
40
+ (1): ZoneOutCell(
41
+ (cell): LSTMCell(1024, 1024)
42
+ )
43
+ )
44
+ (prenet): Prenet(
45
+ (prenet): ModuleList(
46
+ (0): Sequential(
47
+ (0): Linear(in_features=80, out_features=256, bias=True)
48
+ (1): ReLU()
49
+ )
50
+ (1): Sequential(
51
+ (0): Linear(in_features=256, out_features=256, bias=True)
52
+ (1): ReLU()
53
+ )
54
+ )
55
+ )
56
+ (postnet): Postnet(
57
+ (postnet): ModuleList(
58
+ (0): Sequential(
59
+ (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
60
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
61
+ (2): Tanh()
62
+ (3): Dropout(p=0.5, inplace=False)
63
+ )
64
+ (1-3): 3 x Sequential(
65
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
66
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
67
+ (2): Tanh()
68
+ (3): Dropout(p=0.5, inplace=False)
69
+ )
70
+ (4): Sequential(
71
+ (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
72
+ (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
73
+ (2): Dropout(p=0.5, inplace=False)
74
+ )
75
+ )
76
+ )
77
+ (feat_out): Linear(in_features=1536, out_features=240, bias=False)
78
+ (prob_out): Linear(in_features=1536, out_features=3, bias=True)
79
+ )
80
+ (taco2_loss): Tacotron2Loss(
81
+ (l1_criterion): L1Loss()
82
+ (mse_criterion): MSELoss()
83
+ (bce_criterion): BCEWithLogitsLoss()
84
+ )
85
+ (attn_loss): GuidedAttentionLoss()
86
+ )
87
+ )
88
+
89
+ Model summary:
90
+ Class Name: ESPnetTTSModel
91
+ Total Number of model parameters: 26.91 M
92
+ Number of trainable parameters: 26.91 M (100.0%)
93
+ Size: 107.63 MB
94
+ Type: torch.float32
95
+ [7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1207) INFO: Optimizer:
96
+ Adam (
97
+ Parameter Group 0
98
+ amsgrad: False
99
+ betas: (0.9, 0.999)
100
+ capturable: False
101
+ differentiable: False
102
+ eps: 1e-06
103
+ foreach: None
104
+ fused: None
105
+ lr: 0.001
106
+ maximize: False
107
+ weight_decay: 0.0
108
+ )
109
+ [7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1208) INFO: Scheduler: None
110
+ [7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.14/config.yaml
111
+ [7850374a3496] 2023-07-13 14:10:41,061 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.14', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.14.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.14.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
112
+ /opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
113
+ Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
114
+ return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
115
+ # Accounting: time=13 threads=1
116
+ # Ended (code 0) at Thu Jul 13 14:10:45 UTC 2023, elapsed time 13 seconds
exp/tts_stats_raw_phn_none/logdir/stats.15.log ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.15.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.15.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.15 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
2
+ # Started at Thu Jul 13 14:10:44 UTC 2023
3
+ #
4
+ /opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
5
+ warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
6
+ /opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.15.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.15.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.15 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
7
+ [7850374a3496] 2023-07-13 14:10:52,286 (tts:293) INFO: Vocabulary size: 79
8
+ [7850374a3496] 2023-07-13 14:10:52,979 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
9
+ [7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1204) INFO: Model structure:
10
+ ESPnetTTSModel(
11
+ (feats_extract): LogMelFbank(
12
+ (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
13
+ (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
14
+ )
15
+ (tts): Tacotron2(
16
+ (enc): Encoder(
17
+ (embed): Embedding(79, 512, padding_idx=0)
18
+ (convs): ModuleList(
19
+ (0-2): 3 x Sequential(
20
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
21
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
22
+ (2): ReLU()
23
+ (3): Dropout(p=0.5, inplace=False)
24
+ )
25
+ )
26
+ (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
27
+ )
28
+ (dec): Decoder(
29
+ (att): AttLoc(
30
+ (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
31
+ (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
32
+ (mlp_att): Linear(in_features=32, out_features=512, bias=False)
33
+ (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
34
+ (gvec): Linear(in_features=512, out_features=1, bias=True)
35
+ )
36
+ (lstm): ModuleList(
37
+ (0): ZoneOutCell(
38
+ (cell): LSTMCell(768, 1024)
39
+ )
40
+ (1): ZoneOutCell(
41
+ (cell): LSTMCell(1024, 1024)
42
+ )
43
+ )
44
+ (prenet): Prenet(
45
+ (prenet): ModuleList(
46
+ (0): Sequential(
47
+ (0): Linear(in_features=80, out_features=256, bias=True)
48
+ (1): ReLU()
49
+ )
50
+ (1): Sequential(
51
+ (0): Linear(in_features=256, out_features=256, bias=True)
52
+ (1): ReLU()
53
+ )
54
+ )
55
+ )
56
+ (postnet): Postnet(
57
+ (postnet): ModuleList(
58
+ (0): Sequential(
59
+ (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
60
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
61
+ (2): Tanh()
62
+ (3): Dropout(p=0.5, inplace=False)
63
+ )
64
+ (1-3): 3 x Sequential(
65
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
66
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
67
+ (2): Tanh()
68
+ (3): Dropout(p=0.5, inplace=False)
69
+ )
70
+ (4): Sequential(
71
+ (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
72
+ (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
73
+ (2): Dropout(p=0.5, inplace=False)
74
+ )
75
+ )
76
+ )
77
+ (feat_out): Linear(in_features=1536, out_features=240, bias=False)
78
+ (prob_out): Linear(in_features=1536, out_features=3, bias=True)
79
+ )
80
+ (taco2_loss): Tacotron2Loss(
81
+ (l1_criterion): L1Loss()
82
+ (mse_criterion): MSELoss()
83
+ (bce_criterion): BCEWithLogitsLoss()
84
+ )
85
+ (attn_loss): GuidedAttentionLoss()
86
+ )
87
+ )
88
+
89
+ Model summary:
90
+ Class Name: ESPnetTTSModel
91
+ Total Number of model parameters: 26.91 M
92
+ Number of trainable parameters: 26.91 M (100.0%)
93
+ Size: 107.63 MB
94
+ Type: torch.float32
95
+ [7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1207) INFO: Optimizer:
96
+ Adam (
97
+ Parameter Group 0
98
+ amsgrad: False
99
+ betas: (0.9, 0.999)
100
+ capturable: False
101
+ differentiable: False
102
+ eps: 1e-06
103
+ foreach: None
104
+ fused: None
105
+ lr: 0.001
106
+ maximize: False
107
+ weight_decay: 0.0
108
+ )
109
+ [7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1208) INFO: Scheduler: None
110
+ [7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml
111
+ [7850374a3496] 2023-07-13 14:10:53,009 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.15', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.15.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.15.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
112
+ /opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
113
+ Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
114
+ return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
115
+ # Accounting: time=12 threads=1
116
+ # Ended (code 0) at Thu Jul 13 14:10:56 UTC 2023, elapsed time 12 seconds
exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/finetune_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_stats_raw_phn_none/logdir/stats.15
7
+ ngpu: 0
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: null
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: true
26
+ write_collected_feats: false
27
+ max_epoch: 120
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 1
49
+ no_forward_run: false
50
+ resume: false
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ create_graph_in_tensorboard: false
57
+ use_wandb: false
58
+ wandb_project: null
59
+ wandb_id: null
60
+ wandb_entity: null
61
+ wandb_name: null
62
+ wandb_model_log_interval: -1
63
+ detect_anomaly: false
64
+ pretrain_path: null
65
+ init_param: []
66
+ ignore_init_mismatch: false
67
+ freeze_param: []
68
+ num_iters_per_epoch: 200
69
+ batch_size: 20
70
+ valid_batch_size: null
71
+ batch_bins: 1600000
72
+ valid_batch_bins: null
73
+ train_shape_file:
74
+ - exp/tts_stats_raw_phn_none/logdir/train.15.scp
75
+ valid_shape_file:
76
+ - exp/tts_stats_raw_phn_none/logdir/valid.15.scp
77
+ batch_type: numel
78
+ valid_batch_type: null
79
+ fold_length: []
80
+ sort_in_batch: descending
81
+ sort_batch: descending
82
+ multiple_iterator: false
83
+ chunk_length: 500
84
+ chunk_shift_ratio: 0.5
85
+ num_cache_chunks: 1024
86
+ chunk_excluded_key_prefixes: []
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/train/text
89
+ - text
90
+ - text
91
+ - - dump/raw/train/wav.scp
92
+ - speech
93
+ - sound
94
+ valid_data_path_and_name_and_type:
95
+ - - dump/raw/dev/text
96
+ - text
97
+ - text
98
+ - - dump/raw/dev/wav.scp
99
+ - speech
100
+ - sound
101
+ allow_variable_data_keys: false
102
+ max_cache_size: 0.0
103
+ max_cache_fd: 32
104
+ valid_max_cache_size: null
105
+ exclude_weight_decay: false
106
+ exclude_weight_decay_conf: {}
107
+ optim: adam
108
+ optim_conf:
109
+ lr: 0.001
110
+ eps: 1.0e-06
111
+ weight_decay: 0.0
112
+ scheduler: null
113
+ scheduler_conf: {}
114
+ token_list:
115
+ - <blank>
116
+ - <unk>
117
+ - a
118
+ - sil
119
+ - l
120
+ - aa
121
+ - m
122
+ - ii0
123
+ - t
124
+ - <
125
+ - n
126
+ - r
127
+ - E
128
+ - i0
129
+ - b
130
+ - uu0
131
+ - f
132
+ - i1
133
+ - k
134
+ - w
135
+ - A
136
+ - s
137
+ - y
138
+ - d
139
+ - q
140
+ - h
141
+ - H
142
+ - $
143
+ - u0
144
+ - AA
145
+ - j
146
+ - T
147
+ - x
148
+ - S
149
+ - z
150
+ - ll
151
+ - I1
152
+ - D
153
+ - II0
154
+ - g
155
+ - tt
156
+ - rr
157
+ - I0
158
+ - UU0
159
+ - dd
160
+ - u1
161
+ - U0
162
+ - mm
163
+ - nn
164
+ - '*'
165
+ - $$
166
+ - bb
167
+ - yy
168
+ - ss
169
+ - jj
170
+ - ww
171
+ - ^
172
+ - SS
173
+ - TT
174
+ - Z
175
+ - zz
176
+ - kk
177
+ - U1
178
+ - HH
179
+ - ff
180
+ - qq
181
+ - xx
182
+ - ^^
183
+ - DD
184
+ - hh
185
+ - EE
186
+ - ZZ
187
+ - '**'
188
+ - aaaa
189
+ - ssss
190
+ - v
191
+ - uu1
192
+ - jjjj
193
+ - <sos/eos>
194
+ odim: null
195
+ model_conf: {}
196
+ use_preprocessor: true
197
+ token_type: phn
198
+ bpemodel: null
199
+ non_linguistic_symbols: null
200
+ cleaner: null
201
+ g2p: null
202
+ feats_extract: fbank
203
+ feats_extract_conf:
204
+ n_fft: 1024
205
+ hop_length: 256
206
+ win_length: null
207
+ fs: 22050
208
+ fmin: 80
209
+ fmax: 7600
210
+ n_mels: 80
211
+ normalize: null
212
+ normalize_conf: {}
213
+ tts: tacotron2
214
+ tts_conf:
215
+ embed_dim: 512
216
+ elayers: 1
217
+ eunits: 512
218
+ econv_layers: 3
219
+ econv_chans: 512
220
+ econv_filts: 5
221
+ atype: location
222
+ adim: 512
223
+ aconv_chans: 32
224
+ aconv_filts: 15
225
+ cumulate_att_w: true
226
+ dlayers: 2
227
+ dunits: 1024
228
+ prenet_layers: 2
229
+ prenet_units: 256
230
+ postnet_layers: 5
231
+ postnet_chans: 512
232
+ postnet_filts: 5
233
+ output_activation: null
234
+ use_batch_norm: true
235
+ use_concate: true
236
+ use_residual: false
237
+ dropout_rate: 0.5
238
+ zoneout_rate: 0.1
239
+ reduction_factor: 3
240
+ spk_embed_dim: null
241
+ use_masking: true
242
+ bce_pos_weight: 20.0
243
+ use_guided_attn_loss: true
244
+ guided_attn_loss_sigma: 0.4
245
+ guided_attn_loss_lambda: 1.0
246
+ pitch_extract: null
247
+ pitch_extract_conf:
248
+ fs: 22050
249
+ n_fft: 1024
250
+ hop_length: 256
251
+ f0max: 400
252
+ f0min: 80
253
+ pitch_normalize: null
254
+ pitch_normalize_conf: {}
255
+ energy_extract: null
256
+ energy_extract_conf:
257
+ fs: 22050
258
+ n_fft: 1024
259
+ hop_length: 256
260
+ win_length: null
261
+ energy_normalize: null
262
+ energy_normalize_conf: {}
263
+ required:
264
+ - output_dir
265
+ - token_list
266
+ version: '202304'
267
+ distributed: false
exp/tts_stats_raw_phn_none/logdir/stats.15/train/batch_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ text
2
+ speech
exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_lengths_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7321393931081a400396aafc1edb9605c0808638ac13716f0f23942f51e167a
3
+ size 778
exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:239fc3342b13b16bd839d1ac8b21666aad2e96b1b2341e9c6c4c53e063f99526
3
+ size 1402
exp/tts_stats_raw_phn_none/logdir/stats.15/train/speech_shape ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 19928 141056
2
+ 19931 133376
3
+ 19935 203520
4
+ 19938 102144
5
+ 19944 126464
6
+ 19946 116992
7
+ 19947 154112
8
+ 19948 171637
9
+ 19949 141056
10
+ 19951 214272
11
+ 19952 165376
12
+ 19955 134912
13
+ 19957 150596
14
+ 19959 176896
15
+ 19976 169472
16
+ 19979 119808
17
+ 19981 134144
18
+ 19984 171520
19
+ 19990 235008
20
+ 19998 195840
21
+ 200 125440
22
+ 20001 184576
23
+ 20005 108032
24
+ 20020 164608
25
+ 20022 235264
26
+ 20029 174080
27
+ 20038 216576
28
+ 20042 241920
29
+ 20051 203776
30
+ 20055 168448
31
+ 20062 152064
32
+ 20080 219136
33
+ 20087 116992
34
+ 20095 193792
35
+ 201 119040
36
+ 20109 167424
37
+ 20119 149760
38
+ 20120 154368
39
+ 20121 172288
40
+ 20128 143872
41
+ 20144 112128
42
+ 20147 167168
43
+ 20183 139520
exp/tts_stats_raw_phn_none/logdir/stats.15/train/stats_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ feats
2
+ feats_lengths
exp/tts_stats_raw_phn_none/logdir/stats.15/train/text_shape ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 19928 71
2
+ 19931 84
3
+ 19935 117
4
+ 19938 58
5
+ 19944 71
6
+ 19946 52
7
+ 19947 71
8
+ 19948 80
9
+ 19949 64
10
+ 19951 133
11
+ 19952 102
12
+ 19955 64
13
+ 19957 81
14
+ 19959 106
15
+ 19976 97
16
+ 19979 66
17
+ 19981 66
18
+ 19984 80
19
+ 19990 127
20
+ 19998 100
21
+ 200 64
22
+ 20001 98
23
+ 20005 61
24
+ 20020 68
25
+ 20022 143
26
+ 20029 103
27
+ 20038 123
28
+ 20042 136
29
+ 20051 106
30
+ 20055 97
31
+ 20062 90
32
+ 20080 124
33
+ 20087 52
34
+ 20095 101
35
+ 201 67
36
+ 20109 82
37
+ 20119 64
38
+ 20120 93
39
+ 20121 83
40
+ 20128 67
41
+ 20144 69
42
+ 20147 78
43
+ 20183 70
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/batch_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ text
2
+ speech
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_lengths_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ab01415ef2a97eaa04e81355080bc38b3f9b0343f8e97e91044090b6ff63685
3
+ size 778
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89cf49faa040cf392b87903167b64d4599f1d322f1a2937c91823fca48e139a9
3
+ size 1402
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/speech_shape ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 19769 152064
2
+ 19771 194816
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/stats_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ feats
2
+ feats_lengths
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/text_shape ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 19769 84
2
+ 19771 108
exp/tts_stats_raw_phn_none/logdir/stats.17.log ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.17.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.17.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.17 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
2
+ # Started at Thu Jul 13 14:10:56 UTC 2023
3
+ #
4
+ /opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
5
+ warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
6
+ /opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.17.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.17.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.17 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
7
+ [7850374a3496] 2023-07-13 14:11:04,338 (tts:293) INFO: Vocabulary size: 79
8
+ [7850374a3496] 2023-07-13 14:11:05,061 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
9
+ [7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1204) INFO: Model structure:
10
+ ESPnetTTSModel(
11
+ (feats_extract): LogMelFbank(
12
+ (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
13
+ (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
14
+ )
15
+ (tts): Tacotron2(
16
+ (enc): Encoder(
17
+ (embed): Embedding(79, 512, padding_idx=0)
18
+ (convs): ModuleList(
19
+ (0-2): 3 x Sequential(
20
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
21
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
22
+ (2): ReLU()
23
+ (3): Dropout(p=0.5, inplace=False)
24
+ )
25
+ )
26
+ (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
27
+ )
28
+ (dec): Decoder(
29
+ (att): AttLoc(
30
+ (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
31
+ (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
32
+ (mlp_att): Linear(in_features=32, out_features=512, bias=False)
33
+ (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
34
+ (gvec): Linear(in_features=512, out_features=1, bias=True)
35
+ )
36
+ (lstm): ModuleList(
37
+ (0): ZoneOutCell(
38
+ (cell): LSTMCell(768, 1024)
39
+ )
40
+ (1): ZoneOutCell(
41
+ (cell): LSTMCell(1024, 1024)
42
+ )
43
+ )
44
+ (prenet): Prenet(
45
+ (prenet): ModuleList(
46
+ (0): Sequential(
47
+ (0): Linear(in_features=80, out_features=256, bias=True)
48
+ (1): ReLU()
49
+ )
50
+ (1): Sequential(
51
+ (0): Linear(in_features=256, out_features=256, bias=True)
52
+ (1): ReLU()
53
+ )
54
+ )
55
+ )
56
+ (postnet): Postnet(
57
+ (postnet): ModuleList(
58
+ (0): Sequential(
59
+ (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
60
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
61
+ (2): Tanh()
62
+ (3): Dropout(p=0.5, inplace=False)
63
+ )
64
+ (1-3): 3 x Sequential(
65
+ (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
66
+ (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
67
+ (2): Tanh()
68
+ (3): Dropout(p=0.5, inplace=False)
69
+ )
70
+ (4): Sequential(
71
+ (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
72
+ (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
73
+ (2): Dropout(p=0.5, inplace=False)
74
+ )
75
+ )
76
+ )
77
+ (feat_out): Linear(in_features=1536, out_features=240, bias=False)
78
+ (prob_out): Linear(in_features=1536, out_features=3, bias=True)
79
+ )
80
+ (taco2_loss): Tacotron2Loss(
81
+ (l1_criterion): L1Loss()
82
+ (mse_criterion): MSELoss()
83
+ (bce_criterion): BCEWithLogitsLoss()
84
+ )
85
+ (attn_loss): GuidedAttentionLoss()
86
+ )
87
+ )
88
+
89
+ Model summary:
90
+ Class Name: ESPnetTTSModel
91
+ Total Number of model parameters: 26.91 M
92
+ Number of trainable parameters: 26.91 M (100.0%)
93
+ Size: 107.63 MB
94
+ Type: torch.float32
95
+ [7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1207) INFO: Optimizer:
96
+ Adam (
97
+ Parameter Group 0
98
+ amsgrad: False
99
+ betas: (0.9, 0.999)
100
+ capturable: False
101
+ differentiable: False
102
+ eps: 1e-06
103
+ foreach: None
104
+ fused: None
105
+ lr: 0.001
106
+ maximize: False
107
+ weight_decay: 0.0
108
+ )
109
+ [7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1208) INFO: Scheduler: None
110
+ [7850374a3496] 2023-07-13 14:11:05,065 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml
111
+ [7850374a3496] 2023-07-13 14:11:05,100 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.17', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.17.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.17.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
112
+ /opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
113
+ Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
114
+ return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
115
+ # Accounting: time=13 threads=1
116
+ # Ended (code 0) at Thu Jul 13 14:11:09 UTC 2023, elapsed time 13 seconds
exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/finetune_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_stats_raw_phn_none/logdir/stats.17
7
+ ngpu: 0
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: null
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: true
26
+ write_collected_feats: false
27
+ max_epoch: 120
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 1
49
+ no_forward_run: false
50
+ resume: false
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ create_graph_in_tensorboard: false
57
+ use_wandb: false
58
+ wandb_project: null
59
+ wandb_id: null
60
+ wandb_entity: null
61
+ wandb_name: null
62
+ wandb_model_log_interval: -1
63
+ detect_anomaly: false
64
+ pretrain_path: null
65
+ init_param: []
66
+ ignore_init_mismatch: false
67
+ freeze_param: []
68
+ num_iters_per_epoch: 200
69
+ batch_size: 20
70
+ valid_batch_size: null
71
+ batch_bins: 1600000
72
+ valid_batch_bins: null
73
+ train_shape_file:
74
+ - exp/tts_stats_raw_phn_none/logdir/train.17.scp
75
+ valid_shape_file:
76
+ - exp/tts_stats_raw_phn_none/logdir/valid.17.scp
77
+ batch_type: numel
78
+ valid_batch_type: null
79
+ fold_length: []
80
+ sort_in_batch: descending
81
+ sort_batch: descending
82
+ multiple_iterator: false
83
+ chunk_length: 500
84
+ chunk_shift_ratio: 0.5
85
+ num_cache_chunks: 1024
86
+ chunk_excluded_key_prefixes: []
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/train/text
89
+ - text
90
+ - text
91
+ - - dump/raw/train/wav.scp
92
+ - speech
93
+ - sound
94
+ valid_data_path_and_name_and_type:
95
+ - - dump/raw/dev/text
96
+ - text
97
+ - text
98
+ - - dump/raw/dev/wav.scp
99
+ - speech
100
+ - sound
101
+ allow_variable_data_keys: false
102
+ max_cache_size: 0.0
103
+ max_cache_fd: 32
104
+ valid_max_cache_size: null
105
+ exclude_weight_decay: false
106
+ exclude_weight_decay_conf: {}
107
+ optim: adam
108
+ optim_conf:
109
+ lr: 0.001
110
+ eps: 1.0e-06
111
+ weight_decay: 0.0
112
+ scheduler: null
113
+ scheduler_conf: {}
114
+ token_list:
115
+ - <blank>
116
+ - <unk>
117
+ - a
118
+ - sil
119
+ - l
120
+ - aa
121
+ - m
122
+ - ii0
123
+ - t
124
+ - <
125
+ - n
126
+ - r
127
+ - E
128
+ - i0
129
+ - b
130
+ - uu0
131
+ - f
132
+ - i1
133
+ - k
134
+ - w
135
+ - A
136
+ - s
137
+ - y
138
+ - d
139
+ - q
140
+ - h
141
+ - H
142
+ - $
143
+ - u0
144
+ - AA
145
+ - j
146
+ - T
147
+ - x
148
+ - S
149
+ - z
150
+ - ll
151
+ - I1
152
+ - D
153
+ - II0
154
+ - g
155
+ - tt
156
+ - rr
157
+ - I0
158
+ - UU0
159
+ - dd
160
+ - u1
161
+ - U0
162
+ - mm
163
+ - nn
164
+ - '*'
165
+ - $$
166
+ - bb
167
+ - yy
168
+ - ss
169
+ - jj
170
+ - ww
171
+ - ^
172
+ - SS
173
+ - TT
174
+ - Z
175
+ - zz
176
+ - kk
177
+ - U1
178
+ - HH
179
+ - ff
180
+ - qq
181
+ - xx
182
+ - ^^
183
+ - DD
184
+ - hh
185
+ - EE
186
+ - ZZ
187
+ - '**'
188
+ - aaaa
189
+ - ssss
190
+ - v
191
+ - uu1
192
+ - jjjj
193
+ - <sos/eos>
194
+ odim: null
195
+ model_conf: {}
196
+ use_preprocessor: true
197
+ token_type: phn
198
+ bpemodel: null
199
+ non_linguistic_symbols: null
200
+ cleaner: null
201
+ g2p: null
202
+ feats_extract: fbank
203
+ feats_extract_conf:
204
+ n_fft: 1024
205
+ hop_length: 256
206
+ win_length: null
207
+ fs: 22050
208
+ fmin: 80
209
+ fmax: 7600
210
+ n_mels: 80
211
+ normalize: null
212
+ normalize_conf: {}
213
+ tts: tacotron2
214
+ tts_conf:
215
+ embed_dim: 512
216
+ elayers: 1
217
+ eunits: 512
218
+ econv_layers: 3
219
+ econv_chans: 512
220
+ econv_filts: 5
221
+ atype: location
222
+ adim: 512
223
+ aconv_chans: 32
224
+ aconv_filts: 15
225
+ cumulate_att_w: true
226
+ dlayers: 2
227
+ dunits: 1024
228
+ prenet_layers: 2
229
+ prenet_units: 256
230
+ postnet_layers: 5
231
+ postnet_chans: 512
232
+ postnet_filts: 5
233
+ output_activation: null
234
+ use_batch_norm: true
235
+ use_concate: true
236
+ use_residual: false
237
+ dropout_rate: 0.5
238
+ zoneout_rate: 0.1
239
+ reduction_factor: 3
240
+ spk_embed_dim: null
241
+ use_masking: true
242
+ bce_pos_weight: 20.0
243
+ use_guided_attn_loss: true
244
+ guided_attn_loss_sigma: 0.4
245
+ guided_attn_loss_lambda: 1.0
246
+ pitch_extract: null
247
+ pitch_extract_conf:
248
+ fs: 22050
249
+ n_fft: 1024
250
+ hop_length: 256
251
+ f0max: 400
252
+ f0min: 80
253
+ pitch_normalize: null
254
+ pitch_normalize_conf: {}
255
+ energy_extract: null
256
+ energy_extract_conf:
257
+ fs: 22050
258
+ n_fft: 1024
259
+ hop_length: 256
260
+ win_length: null
261
+ energy_normalize: null
262
+ energy_normalize_conf: {}
263
+ required:
264
+ - output_dir
265
+ - token_list
266
+ version: '202304'
267
+ distributed: false
exp/tts_stats_raw_phn_none/logdir/stats.17/train/batch_keys ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ text
2
+ speech
exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_lengths_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:503b8c916b78f4942fd868b9337455d0a4593217bf37efc50c5e8192e7949a22
3
+ size 778
exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:242283537138ddfc5699bdd17945a6d6bf4a95ff1d368666989414d0b47ca626
3
+ size 1402