Spaces:

Elyadata
/

TunArTTS

Sleeping

App Files Files Community

imenLa commited on Feb 28, 2024

Commit

c045d56

verified ·

1 Parent(s): 9f2fb99

Upload 423 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

exp/tts_stats_raw_phn_none/logdir/stats.1.log +116 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/config.yaml +267 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/train/batch_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_lengths_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/train/speech_shape +43 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/train/stats_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/train/text_shape +43 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/batch_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_lengths_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/speech_shape +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/stats_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/text_shape +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.11.log +116 -0
exp/tts_stats_raw_phn_none/logdir/stats.12.log +116 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml +267 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/train/batch_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_lengths_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/train/speech_shape +43 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/train/stats_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/train/text_shape +43 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/batch_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_lengths_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/speech_shape +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/stats_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/text_shape +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.13.log +116 -0
exp/tts_stats_raw_phn_none/logdir/stats.14.log +116 -0
exp/tts_stats_raw_phn_none/logdir/stats.15.log +116 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml +267 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/train/batch_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_lengths_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/train/speech_shape +43 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/train/stats_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/train/text_shape +43 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/batch_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_lengths_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/speech_shape +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/stats_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/text_shape +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.17.log +116 -0
exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml +267 -0
exp/tts_stats_raw_phn_none/logdir/stats.17/train/batch_keys +2 -0
exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_lengths_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_stats.npz +3 -0

exp/tts_stats_raw_phn_none/logdir/stats.1.log ADDED Viewed

	@@ -0,0 +1,116 @@

+# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.1 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+# Started at Thu Jul 13 14:09:11 UTC 2023
+#
+/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
+  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
+/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.1 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+[7850374a3496] 2023-07-13 14:09:21,971 (tts:293) INFO: Vocabulary size: 79
+[7850374a3496] 2023-07-13 14:09:22,770 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
+[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1204) INFO: Model structure:
+ESPnetTTSModel(
+  (feats_extract): LogMelFbank(
+    (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
+    (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
+  )
+  (tts): Tacotron2(
+    (enc): Encoder(
+      (embed): Embedding(79, 512, padding_idx=0)
+      (convs): ModuleList(
+        (0-2): 3 x Sequential(
+          (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+          (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (2): ReLU()
+          (3): Dropout(p=0.5, inplace=False)
+        )
+      )
+      (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
+    )
+    (dec): Decoder(
+      (att): AttLoc(
+        (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
+        (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
+        (mlp_att): Linear(in_features=32, out_features=512, bias=False)
+        (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
+        (gvec): Linear(in_features=512, out_features=1, bias=True)
+      )
+      (lstm): ModuleList(
+        (0): ZoneOutCell(
+          (cell): LSTMCell(768, 1024)
+        )
+        (1): ZoneOutCell(
+          (cell): LSTMCell(1024, 1024)
+        )
+      )
+      (prenet): Prenet(
+        (prenet): ModuleList(
+          (0): Sequential(
+            (0): Linear(in_features=80, out_features=256, bias=True)
+            (1): ReLU()
+          )
+          (1): Sequential(
+            (0): Linear(in_features=256, out_features=256, bias=True)
+            (1): ReLU()
+          )
+        )
+      )
+      (postnet): Postnet(
+        (postnet): ModuleList(
+          (0): Sequential(
+            (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (1-3): 3 x Sequential(
+            (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (4): Sequential(
+            (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Dropout(p=0.5, inplace=False)
+          )
+        )
+      )
+      (feat_out): Linear(in_features=1536, out_features=240, bias=False)
+      (prob_out): Linear(in_features=1536, out_features=3, bias=True)
+    )
+    (taco2_loss): Tacotron2Loss(
+      (l1_criterion): L1Loss()
+      (mse_criterion): MSELoss()
+      (bce_criterion): BCEWithLogitsLoss()
+    )
+    (attn_loss): GuidedAttentionLoss()
+  )
+)
+Model summary:
+    Class Name: ESPnetTTSModel
+    Total Number of model parameters: 26.91 M
+    Number of trainable parameters: 26.91 M (100.0%)
+    Size: 107.63 MB
+    Type: torch.float32
+[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1207) INFO: Optimizer:
+Adam (
+Parameter Group 0
+    amsgrad: False
+    betas: (0.9, 0.999)
+    capturable: False
+    differentiable: False
+    eps: 1e-06
+    foreach: None
+    fused: None
+    lr: 0.001
+    maximize: False
+    weight_decay: 0.0
+)
+[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1208) INFO: Scheduler: None
+[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.1/config.yaml
+[7850374a3496] 2023-07-13 14:09:22,799 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.1', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.1.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.1.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
+/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
+Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
+  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
+# Accounting: time=16 threads=1
+# Ended (code 0) at Thu Jul 13 14:09:27 UTC 2023, elapsed time 16 seconds

exp/tts_stats_raw_phn_none/logdir/stats.10/config.yaml ADDED Viewed

	@@ -0,0 +1,267 @@

+config: conf/tuning/finetune_tacotron2.yaml
+print_config: false
+log_level: INFO
+dry_run: false
+iterator_type: sequence
+output_dir: exp/tts_stats_raw_phn_none/logdir/stats.10
+ngpu: 0
+seed: 0
+num_workers: 1
+num_att_plot: 3
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: null
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: false
+unused_parameters: false
+sharded_ddp: false
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: true
+collect_stats: true
+write_collected_feats: false
+max_epoch: 120
+patience: null
+val_scheduler_criterion:
+- valid
+- loss
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+-   - valid
+    - loss
+    - min
+-   - train
+    - loss
+    - min
+keep_nbest_models: 5
+nbest_averaging_interval: 0
+grad_clip: 1.0
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: false
+train_dtype: float32
+use_amp: false
+log_interval: null
+use_matplotlib: true
+use_tensorboard: true
+create_graph_in_tensorboard: false
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+detect_anomaly: false
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param: []
+num_iters_per_epoch: 200
+batch_size: 20
+valid_batch_size: null
+batch_bins: 1600000
+valid_batch_bins: null
+train_shape_file:
+- exp/tts_stats_raw_phn_none/logdir/train.10.scp
+valid_shape_file:
+- exp/tts_stats_raw_phn_none/logdir/valid.10.scp
+batch_type: numel
+valid_batch_type: null
+fold_length: []
+sort_in_batch: descending
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+chunk_excluded_key_prefixes: []
+train_data_path_and_name_and_type:
+-   - dump/raw/train/text
+    - text
+    - text
+-   - dump/raw/train/wav.scp
+    - speech
+    - sound
+valid_data_path_and_name_and_type:
+-   - dump/raw/dev/text
+    - text
+    - text
+-   - dump/raw/dev/wav.scp
+    - speech
+    - sound
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+valid_max_cache_size: null
+exclude_weight_decay: false
+exclude_weight_decay_conf: {}
+optim: adam
+optim_conf:
+    lr: 0.001
+    eps: 1.0e-06
+    weight_decay: 0.0
+scheduler: null
+scheduler_conf: {}
+token_list:
+- <blank>
+- <unk>
+- a
+- sil
+- l
+- aa
+- m
+- ii0
+- t
+- <
+- n
+- r
+- E
+- i0
+- b
+- uu0
+- f
+- i1
+- k
+- w
+- A
+- s
+- y
+- d
+- q
+- h
+- H
+- $
+- u0
+- AA
+- j
+- T
+- x
+- S
+- z
+- ll
+- I1
+- D
+- II0
+- g
+- tt
+- rr
+- I0
+- UU0
+- dd
+- u1
+- U0
+- mm
+- nn
+- '*'
+- $$
+- bb
+- yy
+- ss
+- jj
+- ww
+- ^
+- SS
+- TT
+- Z
+- zz
+- kk
+- U1
+- HH
+- ff
+- qq
+- xx
+- ^^
+- DD
+- hh
+- EE
+- ZZ
+- '**'
+- aaaa
+- ssss
+- v
+- uu1
+- jjjj
+- <sos/eos>
+odim: null
+model_conf: {}
+use_preprocessor: true
+token_type: phn
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: null
+feats_extract: fbank
+feats_extract_conf:
+    n_fft: 1024
+    hop_length: 256
+    win_length: null
+    fs: 22050
+    fmin: 80
+    fmax: 7600
+    n_mels: 80
+normalize: null
+normalize_conf: {}
+tts: tacotron2
+tts_conf:
+    embed_dim: 512
+    elayers: 1
+    eunits: 512
+    econv_layers: 3
+    econv_chans: 512
+    econv_filts: 5
+    atype: location
+    adim: 512
+    aconv_chans: 32
+    aconv_filts: 15
+    cumulate_att_w: true
+    dlayers: 2
+    dunits: 1024
+    prenet_layers: 2
+    prenet_units: 256
+    postnet_layers: 5
+    postnet_chans: 512
+    postnet_filts: 5
+    output_activation: null
+    use_batch_norm: true
+    use_concate: true
+    use_residual: false
+    dropout_rate: 0.5
+    zoneout_rate: 0.1
+    reduction_factor: 3
+    spk_embed_dim: null
+    use_masking: true
+    bce_pos_weight: 20.0
+    use_guided_attn_loss: true
+    guided_attn_loss_sigma: 0.4
+    guided_attn_loss_lambda: 1.0
+pitch_extract: null
+pitch_extract_conf:
+    fs: 22050
+    n_fft: 1024
+    hop_length: 256
+    f0max: 400
+    f0min: 80
+pitch_normalize: null
+pitch_normalize_conf: {}
+energy_extract: null
+energy_extract_conf:
+    fs: 22050
+    n_fft: 1024
+    hop_length: 256
+    win_length: null
+energy_normalize: null
+energy_normalize_conf: {}
+required:
+- output_dir
+- token_list
+version: '202304'
+distributed: false

exp/tts_stats_raw_phn_none/logdir/stats.10/train/batch_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ text
2	+ speech

exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_lengths_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d6b9f2fd6232f4b0ca33457b5d22c02d2b17b34d24e2f9f1f2415b0ec8a15f0
+size 778

exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:057d10f13786abd5b7b6b90bea854b18ad227d34f19bb8092c488f864880dd51
+size 1402

exp/tts_stats_raw_phn_none/logdir/stats.10/train/speech_shape ADDED Viewed

	@@ -0,0 +1,43 @@

+18935 142336
+18936 141568
+18943 175360
+18944 173824
+18947 190208
+18951 154368
+18955 233216
+18959 226560
+18964 163584
+18982 113664
+18989 163072
+18991 212480
+18993 175872
+18997 101888
+19 122880
+19001 217088
+19005 184832
+19010 156928
+19011 175872
+19015 139520
+19024 165888
+19028 158720
+19063 187136
+19065 144128
+19067 175616
+19075 163584
+19076 214784
+19090 172544
+19091 199936
+19095 118016
+19096 165888
+19099 159488
+191 134144
+19103 124416
+19109 132352
+19111 151740
+19113 129280
+19116 155648
+19118 174336
+19121 137472
+19122 144896
+19132 131072
+19138 135936

exp/tts_stats_raw_phn_none/logdir/stats.10/train/stats_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ feats
2	+ feats_lengths

exp/tts_stats_raw_phn_none/logdir/stats.10/train/text_shape ADDED Viewed

	@@ -0,0 +1,43 @@

+18935 66
+18936 77
+18943 94
+18944 90
+18947 93
+18951 66
+18955 116
+18959 120
+18964 81
+18982 54
+18989 85
+18991 114
+18993 100
+18997 45
+19 58
+19001 132
+19005 97
+19010 82
+19011 97
+19015 72
+19024 90
+19028 71
+19063 115
+19065 84
+19067 83
+19075 78
+19076 112
+19090 92
+19091 108
+19095 62
+19096 89
+19099 87
+191 70
+19103 68
+19109 75
+19111 80
+19113 45
+19116 87
+19118 97
+19121 74
+19122 87
+19132 69
+19138 75

exp/tts_stats_raw_phn_none/logdir/stats.10/valid/batch_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ text
2	+ speech

exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_lengths_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9fe87f28f6100dafb92cda513225e57bd983e4483dbefd895ad65790398958c0
+size 778

exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c23bf05ba35b7d316b51347290281e31e36aca870887098c995fd8f5c860508
+size 1402

exp/tts_stats_raw_phn_none/logdir/stats.10/valid/speech_shape ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 169 189952
2	+ 18237 234496

exp/tts_stats_raw_phn_none/logdir/stats.10/valid/stats_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ feats
2	+ feats_lengths

exp/tts_stats_raw_phn_none/logdir/stats.10/valid/text_shape ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 169 104
2	+ 18237 134

exp/tts_stats_raw_phn_none/logdir/stats.11.log ADDED Viewed

	@@ -0,0 +1,116 @@

+# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.11.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.11.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.11 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+# Started at Thu Jul 13 14:10:19 UTC 2023
+#
+/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
+  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
+/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.11.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.11.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.11 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+[7850374a3496] 2023-07-13 14:10:27,026 (tts:293) INFO: Vocabulary size: 79
+[7850374a3496] 2023-07-13 14:10:27,731 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
+[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1204) INFO: Model structure:
+ESPnetTTSModel(
+  (feats_extract): LogMelFbank(
+    (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
+    (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
+  )
+  (tts): Tacotron2(
+    (enc): Encoder(
+      (embed): Embedding(79, 512, padding_idx=0)
+      (convs): ModuleList(
+        (0-2): 3 x Sequential(
+          (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+          (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (2): ReLU()
+          (3): Dropout(p=0.5, inplace=False)
+        )
+      )
+      (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
+    )
+    (dec): Decoder(
+      (att): AttLoc(
+        (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
+        (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
+        (mlp_att): Linear(in_features=32, out_features=512, bias=False)
+        (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
+        (gvec): Linear(in_features=512, out_features=1, bias=True)
+      )
+      (lstm): ModuleList(
+        (0): ZoneOutCell(
+          (cell): LSTMCell(768, 1024)
+        )
+        (1): ZoneOutCell(
+          (cell): LSTMCell(1024, 1024)
+        )
+      )
+      (prenet): Prenet(
+        (prenet): ModuleList(
+          (0): Sequential(
+            (0): Linear(in_features=80, out_features=256, bias=True)
+            (1): ReLU()
+          )
+          (1): Sequential(
+            (0): Linear(in_features=256, out_features=256, bias=True)
+            (1): ReLU()
+          )
+        )
+      )
+      (postnet): Postnet(
+        (postnet): ModuleList(
+          (0): Sequential(
+            (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (1-3): 3 x Sequential(
+            (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (4): Sequential(
+            (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Dropout(p=0.5, inplace=False)
+          )
+        )
+      )
+      (feat_out): Linear(in_features=1536, out_features=240, bias=False)
+      (prob_out): Linear(in_features=1536, out_features=3, bias=True)
+    )
+    (taco2_loss): Tacotron2Loss(
+      (l1_criterion): L1Loss()
+      (mse_criterion): MSELoss()
+      (bce_criterion): BCEWithLogitsLoss()
+    )
+    (attn_loss): GuidedAttentionLoss()
+  )
+)
+Model summary:
+    Class Name: ESPnetTTSModel
+    Total Number of model parameters: 26.91 M
+    Number of trainable parameters: 26.91 M (100.0%)
+    Size: 107.63 MB
+    Type: torch.float32
+[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1207) INFO: Optimizer:
+Adam (
+Parameter Group 0
+    amsgrad: False
+    betas: (0.9, 0.999)
+    capturable: False
+    differentiable: False
+    eps: 1e-06
+    foreach: None
+    fused: None
+    lr: 0.001
+    maximize: False
+    weight_decay: 0.0
+)
+[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1208) INFO: Scheduler: None
+[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.11/config.yaml
+[7850374a3496] 2023-07-13 14:10:27,761 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.11', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.11.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.11.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
+/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
+Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
+  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
+# Accounting: time=12 threads=1
+# Ended (code 0) at Thu Jul 13 14:10:31 UTC 2023, elapsed time 12 seconds

exp/tts_stats_raw_phn_none/logdir/stats.12.log ADDED Viewed

	@@ -0,0 +1,116 @@

+# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.12.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.12.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.12 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+# Started at Thu Jul 13 14:10:19 UTC 2023
+#
+/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
+  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
+/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.12.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.12.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.12 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+[7850374a3496] 2023-07-13 14:10:27,287 (tts:293) INFO: Vocabulary size: 79
+[7850374a3496] 2023-07-13 14:10:27,998 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
+[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1204) INFO: Model structure:
+ESPnetTTSModel(
+  (feats_extract): LogMelFbank(
+    (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
+    (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
+  )
+  (tts): Tacotron2(
+    (enc): Encoder(
+      (embed): Embedding(79, 512, padding_idx=0)
+      (convs): ModuleList(
+        (0-2): 3 x Sequential(
+          (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+          (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (2): ReLU()
+          (3): Dropout(p=0.5, inplace=False)
+        )
+      )
+      (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
+    )
+    (dec): Decoder(
+      (att): AttLoc(
+        (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
+        (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
+        (mlp_att): Linear(in_features=32, out_features=512, bias=False)
+        (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
+        (gvec): Linear(in_features=512, out_features=1, bias=True)
+      )
+      (lstm): ModuleList(
+        (0): ZoneOutCell(
+          (cell): LSTMCell(768, 1024)
+        )
+        (1): ZoneOutCell(
+          (cell): LSTMCell(1024, 1024)
+        )
+      )
+      (prenet): Prenet(
+        (prenet): ModuleList(
+          (0): Sequential(
+            (0): Linear(in_features=80, out_features=256, bias=True)
+            (1): ReLU()
+          )
+          (1): Sequential(
+            (0): Linear(in_features=256, out_features=256, bias=True)
+            (1): ReLU()
+          )
+        )
+      )
+      (postnet): Postnet(
+        (postnet): ModuleList(
+          (0): Sequential(
+            (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (1-3): 3 x Sequential(
+            (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (4): Sequential(
+            (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Dropout(p=0.5, inplace=False)
+          )
+        )
+      )
+      (feat_out): Linear(in_features=1536, out_features=240, bias=False)
+      (prob_out): Linear(in_features=1536, out_features=3, bias=True)
+    )
+    (taco2_loss): Tacotron2Loss(
+      (l1_criterion): L1Loss()
+      (mse_criterion): MSELoss()
+      (bce_criterion): BCEWithLogitsLoss()
+    )
+    (attn_loss): GuidedAttentionLoss()
+  )
+)
+Model summary:
+    Class Name: ESPnetTTSModel
+    Total Number of model parameters: 26.91 M
+    Number of trainable parameters: 26.91 M (100.0%)
+    Size: 107.63 MB
+    Type: torch.float32
+[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1207) INFO: Optimizer:
+Adam (
+Parameter Group 0
+    amsgrad: False
+    betas: (0.9, 0.999)
+    capturable: False
+    differentiable: False
+    eps: 1e-06
+    foreach: None
+    fused: None
+    lr: 0.001
+    maximize: False
+    weight_decay: 0.0
+)
+[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1208) INFO: Scheduler: None
+[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml
+[7850374a3496] 2023-07-13 14:10:28,024 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.12', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.12.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.12.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
+/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
+Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
+  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
+# Accounting: time=13 threads=1
+# Ended (code 0) at Thu Jul 13 14:10:32 UTC 2023, elapsed time 13 seconds

exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml ADDED Viewed

	@@ -0,0 +1,267 @@

+config: conf/tuning/finetune_tacotron2.yaml
+print_config: false
+log_level: INFO
+dry_run: false
+iterator_type: sequence
+output_dir: exp/tts_stats_raw_phn_none/logdir/stats.12
+ngpu: 0
+seed: 0
+num_workers: 1
+num_att_plot: 3
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: null
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: false
+unused_parameters: false
+sharded_ddp: false
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: true
+collect_stats: true
+write_collected_feats: false
+max_epoch: 120
+patience: null
+val_scheduler_criterion:
+- valid
+- loss
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+-   - valid
+    - loss
+    - min
+-   - train
+    - loss
+    - min
+keep_nbest_models: 5
+nbest_averaging_interval: 0
+grad_clip: 1.0
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: false
+train_dtype: float32
+use_amp: false
+log_interval: null
+use_matplotlib: true
+use_tensorboard: true
+create_graph_in_tensorboard: false
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+detect_anomaly: false
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param: []
+num_iters_per_epoch: 200
+batch_size: 20
+valid_batch_size: null
+batch_bins: 1600000
+valid_batch_bins: null
+train_shape_file:
+- exp/tts_stats_raw_phn_none/logdir/train.12.scp
+valid_shape_file:
+- exp/tts_stats_raw_phn_none/logdir/valid.12.scp
+batch_type: numel
+valid_batch_type: null
+fold_length: []
+sort_in_batch: descending
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+chunk_excluded_key_prefixes: []
+train_data_path_and_name_and_type:
+-   - dump/raw/train/text
+    - text
+    - text
+-   - dump/raw/train/wav.scp
+    - speech
+    - sound
+valid_data_path_and_name_and_type:
+-   - dump/raw/dev/text
+    - text
+    - text
+-   - dump/raw/dev/wav.scp
+    - speech
+    - sound
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+valid_max_cache_size: null
+exclude_weight_decay: false
+exclude_weight_decay_conf: {}
+optim: adam
+optim_conf:
+    lr: 0.001
+    eps: 1.0e-06
+    weight_decay: 0.0
+scheduler: null
+scheduler_conf: {}
+token_list:
+- <blank>
+- <unk>
+- a
+- sil
+- l
+- aa
+- m
+- ii0
+- t
+- <
+- n
+- r
+- E
+- i0
+- b
+- uu0
+- f
+- i1
+- k
+- w
+- A
+- s
+- y
+- d
+- q
+- h
+- H
+- $
+- u0
+- AA
+- j
+- T
+- x
+- S
+- z
+- ll
+- I1
+- D
+- II0
+- g
+- tt
+- rr
+- I0
+- UU0
+- dd
+- u1
+- U0
+- mm
+- nn
+- '*'
+- $$
+- bb
+- yy
+- ss
+- jj
+- ww
+- ^
+- SS
+- TT
+- Z
+- zz
+- kk
+- U1
+- HH
+- ff
+- qq
+- xx
+- ^^
+- DD
+- hh
+- EE
+- ZZ
+- '**'
+- aaaa
+- ssss
+- v
+- uu1
+- jjjj
+- <sos/eos>
+odim: null
+model_conf: {}
+use_preprocessor: true
+token_type: phn
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: null
+feats_extract: fbank
+feats_extract_conf:
+    n_fft: 1024
+    hop_length: 256
+    win_length: null
+    fs: 22050
+    fmin: 80
+    fmax: 7600
+    n_mels: 80
+normalize: null
+normalize_conf: {}
+tts: tacotron2
+tts_conf:
+    embed_dim: 512
+    elayers: 1
+    eunits: 512
+    econv_layers: 3
+    econv_chans: 512
+    econv_filts: 5
+    atype: location
+    adim: 512
+    aconv_chans: 32
+    aconv_filts: 15
+    cumulate_att_w: true
+    dlayers: 2
+    dunits: 1024
+    prenet_layers: 2
+    prenet_units: 256
+    postnet_layers: 5
+    postnet_chans: 512
+    postnet_filts: 5
+    output_activation: null
+    use_batch_norm: true
+    use_concate: true
+    use_residual: false
+    dropout_rate: 0.5
+    zoneout_rate: 0.1
+    reduction_factor: 3
+    spk_embed_dim: null
+    use_masking: true
+    bce_pos_weight: 20.0
+    use_guided_attn_loss: true
+    guided_attn_loss_sigma: 0.4
+    guided_attn_loss_lambda: 1.0
+pitch_extract: null
+pitch_extract_conf:
+    fs: 22050
+    n_fft: 1024
+    hop_length: 256
+    f0max: 400
+    f0min: 80
+pitch_normalize: null
+pitch_normalize_conf: {}
+energy_extract: null
+energy_extract_conf:
+    fs: 22050
+    n_fft: 1024
+    hop_length: 256
+    win_length: null
+energy_normalize: null
+energy_normalize_conf: {}
+required:
+- output_dir
+- token_list
+version: '202304'
+distributed: false

exp/tts_stats_raw_phn_none/logdir/stats.12/train/batch_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ text
2	+ speech

exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_lengths_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d49bfea4033ce1e51b1a17d023326b9c8fc5b58658ad92a4ab13fae6f7b8d624
+size 778

exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:75c6bcb5409152fe06dfbb367a0d796774c6f1e94af5fa448137f8a901f9c284
+size 1402

exp/tts_stats_raw_phn_none/logdir/stats.12/train/speech_shape ADDED Viewed

	@@ -0,0 +1,43 @@

+19360 171776
+19366 143104
+19367 199936
+19371 145920
+19372 162816
+19374 145664
+19376 201682
+19387 219904
+19396 130048
+19399 112896
+194 140032
+19400 183808
+19404 159488
+19406 186624
+19410 183552
+19413 121088
+19414 134912
+19423 198400
+19429 195328
+19439 114944
+19440 97280
+19449 159488
+19451 140032
+19454 120320
+19477 191488
+19482 157696
+19488 169472
+19496 129792
+19499 153344
+195 122624
+19501 137216
+19506 162816
+19509 143872
+19510 119040
+19511 146688
+19521 132864
+19522 167680
+19524 146944
+19529 188928
+19540 193536
+19542 179456
+19543 159669
+19548 138752

exp/tts_stats_raw_phn_none/logdir/stats.12/train/stats_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ feats
2	+ feats_lengths

exp/tts_stats_raw_phn_none/logdir/stats.12/train/text_shape ADDED Viewed

	@@ -0,0 +1,43 @@

+19360 89
+19366 85
+19367 92
+19371 64
+19372 84
+19374 75
+19376 103
+19387 108
+19396 57
+19399 54
+194 62
+19400 92
+19404 99
+19406 103
+19410 102
+19413 67
+19414 60
+19423 90
+19429 90
+19439 51
+19440 33
+19449 92
+19451 67
+19454 61
+19477 97
+19482 84
+19488 93
+19496 63
+19499 65
+195 68
+19501 60
+19506 96
+19509 80
+19510 57
+19511 77
+19521 61
+19522 80
+19524 67
+19529 91
+19540 101
+19542 107
+19543 78
+19548 61

exp/tts_stats_raw_phn_none/logdir/stats.12/valid/batch_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ text
2	+ speech

exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_lengths_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f504865400934158e457d1520e847de7701d6bd8479c772a7d9710d35616c234
+size 778

exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:51eb4c28274a7e81976f604f330f3c2f10cc6cf6a8befcf261e3e49cbdd44ab0
+size 1402

exp/tts_stats_raw_phn_none/logdir/stats.12/valid/speech_shape ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 18963 129280
2	+ 19178 177408

exp/tts_stats_raw_phn_none/logdir/stats.12/valid/stats_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ feats
2	+ feats_lengths

exp/tts_stats_raw_phn_none/logdir/stats.12/valid/text_shape ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 18963 58
2	+ 19178 91

exp/tts_stats_raw_phn_none/logdir/stats.13.log ADDED Viewed

	@@ -0,0 +1,116 @@

+# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.13.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.13.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.13 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+# Started at Thu Jul 13 14:10:31 UTC 2023
+#
+/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
+  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
+/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.13.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.13.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.13 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+[7850374a3496] 2023-07-13 14:10:40,144 (tts:293) INFO: Vocabulary size: 79
+[7850374a3496] 2023-07-13 14:10:40,861 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
+[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1204) INFO: Model structure:
+ESPnetTTSModel(
+  (feats_extract): LogMelFbank(
+    (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
+    (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
+  )
+  (tts): Tacotron2(
+    (enc): Encoder(
+      (embed): Embedding(79, 512, padding_idx=0)
+      (convs): ModuleList(
+        (0-2): 3 x Sequential(
+          (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+          (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (2): ReLU()
+          (3): Dropout(p=0.5, inplace=False)
+        )
+      )
+      (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
+    )
+    (dec): Decoder(
+      (att): AttLoc(
+        (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
+        (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
+        (mlp_att): Linear(in_features=32, out_features=512, bias=False)
+        (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
+        (gvec): Linear(in_features=512, out_features=1, bias=True)
+      )
+      (lstm): ModuleList(
+        (0): ZoneOutCell(
+          (cell): LSTMCell(768, 1024)
+        )
+        (1): ZoneOutCell(
+          (cell): LSTMCell(1024, 1024)
+        )
+      )
+      (prenet): Prenet(
+        (prenet): ModuleList(
+          (0): Sequential(
+            (0): Linear(in_features=80, out_features=256, bias=True)
+            (1): ReLU()
+          )
+          (1): Sequential(
+            (0): Linear(in_features=256, out_features=256, bias=True)
+            (1): ReLU()
+          )
+        )
+      )
+      (postnet): Postnet(
+        (postnet): ModuleList(
+          (0): Sequential(
+            (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (1-3): 3 x Sequential(
+            (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (4): Sequential(
+            (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Dropout(p=0.5, inplace=False)
+          )
+        )
+      )
+      (feat_out): Linear(in_features=1536, out_features=240, bias=False)
+      (prob_out): Linear(in_features=1536, out_features=3, bias=True)
+    )
+    (taco2_loss): Tacotron2Loss(
+      (l1_criterion): L1Loss()
+      (mse_criterion): MSELoss()
+      (bce_criterion): BCEWithLogitsLoss()
+    )
+    (attn_loss): GuidedAttentionLoss()
+  )
+)
+Model summary:
+    Class Name: ESPnetTTSModel
+    Total Number of model parameters: 26.91 M
+    Number of trainable parameters: 26.91 M (100.0%)
+    Size: 107.63 MB
+    Type: torch.float32
+[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1207) INFO: Optimizer:
+Adam (
+Parameter Group 0
+    amsgrad: False
+    betas: (0.9, 0.999)
+    capturable: False
+    differentiable: False
+    eps: 1e-06
+    foreach: None
+    fused: None
+    lr: 0.001
+    maximize: False
+    weight_decay: 0.0
+)
+[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1208) INFO: Scheduler: None
+[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.13/config.yaml
+[7850374a3496] 2023-07-13 14:10:40,891 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.13', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.13.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.13.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
+/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
+Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
+  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
+# Accounting: time=13 threads=1
+# Ended (code 0) at Thu Jul 13 14:10:44 UTC 2023, elapsed time 13 seconds

exp/tts_stats_raw_phn_none/logdir/stats.14.log ADDED Viewed

	@@ -0,0 +1,116 @@

+# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.14.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.14.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.14 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+# Started at Thu Jul 13 14:10:32 UTC 2023
+#
+/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
+  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
+/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.14.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.14.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.14 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+[7850374a3496] 2023-07-13 14:10:40,319 (tts:293) INFO: Vocabulary size: 79
+[7850374a3496] 2023-07-13 14:10:41,034 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
+[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1204) INFO: Model structure:
+ESPnetTTSModel(
+  (feats_extract): LogMelFbank(
+    (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
+    (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
+  )
+  (tts): Tacotron2(
+    (enc): Encoder(
+      (embed): Embedding(79, 512, padding_idx=0)
+      (convs): ModuleList(
+        (0-2): 3 x Sequential(
+          (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+          (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (2): ReLU()
+          (3): Dropout(p=0.5, inplace=False)
+        )
+      )
+      (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
+    )
+    (dec): Decoder(
+      (att): AttLoc(
+        (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
+        (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
+        (mlp_att): Linear(in_features=32, out_features=512, bias=False)
+        (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
+        (gvec): Linear(in_features=512, out_features=1, bias=True)
+      )
+      (lstm): ModuleList(
+        (0): ZoneOutCell(
+          (cell): LSTMCell(768, 1024)
+        )
+        (1): ZoneOutCell(
+          (cell): LSTMCell(1024, 1024)
+        )
+      )
+      (prenet): Prenet(
+        (prenet): ModuleList(
+          (0): Sequential(
+            (0): Linear(in_features=80, out_features=256, bias=True)
+            (1): ReLU()
+          )
+          (1): Sequential(
+            (0): Linear(in_features=256, out_features=256, bias=True)
+            (1): ReLU()
+          )
+        )
+      )
+      (postnet): Postnet(
+        (postnet): ModuleList(
+          (0): Sequential(
+            (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (1-3): 3 x Sequential(
+            (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (4): Sequential(
+            (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Dropout(p=0.5, inplace=False)
+          )
+        )
+      )
+      (feat_out): Linear(in_features=1536, out_features=240, bias=False)
+      (prob_out): Linear(in_features=1536, out_features=3, bias=True)
+    )
+    (taco2_loss): Tacotron2Loss(
+      (l1_criterion): L1Loss()
+      (mse_criterion): MSELoss()
+      (bce_criterion): BCEWithLogitsLoss()
+    )
+    (attn_loss): GuidedAttentionLoss()
+  )
+)
+Model summary:
+    Class Name: ESPnetTTSModel
+    Total Number of model parameters: 26.91 M
+    Number of trainable parameters: 26.91 M (100.0%)
+    Size: 107.63 MB
+    Type: torch.float32
+[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1207) INFO: Optimizer:
+Adam (
+Parameter Group 0
+    amsgrad: False
+    betas: (0.9, 0.999)
+    capturable: False
+    differentiable: False
+    eps: 1e-06
+    foreach: None
+    fused: None
+    lr: 0.001
+    maximize: False
+    weight_decay: 0.0
+)
+[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1208) INFO: Scheduler: None
+[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.14/config.yaml
+[7850374a3496] 2023-07-13 14:10:41,061 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.14', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.14.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.14.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
+/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
+Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
+  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
+# Accounting: time=13 threads=1
+# Ended (code 0) at Thu Jul 13 14:10:45 UTC 2023, elapsed time 13 seconds

exp/tts_stats_raw_phn_none/logdir/stats.15.log ADDED Viewed

	@@ -0,0 +1,116 @@

+# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.15.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.15.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.15 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+# Started at Thu Jul 13 14:10:44 UTC 2023
+#
+/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
+  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
+/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.15.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.15.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.15 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+[7850374a3496] 2023-07-13 14:10:52,286 (tts:293) INFO: Vocabulary size: 79
+[7850374a3496] 2023-07-13 14:10:52,979 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
+[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1204) INFO: Model structure:
+ESPnetTTSModel(
+  (feats_extract): LogMelFbank(
+    (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
+    (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
+  )
+  (tts): Tacotron2(
+    (enc): Encoder(
+      (embed): Embedding(79, 512, padding_idx=0)
+      (convs): ModuleList(
+        (0-2): 3 x Sequential(
+          (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+          (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (2): ReLU()
+          (3): Dropout(p=0.5, inplace=False)
+        )
+      )
+      (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
+    )
+    (dec): Decoder(
+      (att): AttLoc(
+        (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
+        (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
+        (mlp_att): Linear(in_features=32, out_features=512, bias=False)
+        (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
+        (gvec): Linear(in_features=512, out_features=1, bias=True)
+      )
+      (lstm): ModuleList(
+        (0): ZoneOutCell(
+          (cell): LSTMCell(768, 1024)
+        )
+        (1): ZoneOutCell(
+          (cell): LSTMCell(1024, 1024)
+        )
+      )
+      (prenet): Prenet(
+        (prenet): ModuleList(
+          (0): Sequential(
+            (0): Linear(in_features=80, out_features=256, bias=True)
+            (1): ReLU()
+          )
+          (1): Sequential(
+            (0): Linear(in_features=256, out_features=256, bias=True)
+            (1): ReLU()
+          )
+        )
+      )
+      (postnet): Postnet(
+        (postnet): ModuleList(
+          (0): Sequential(
+            (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (1-3): 3 x Sequential(
+            (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (4): Sequential(
+            (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Dropout(p=0.5, inplace=False)
+          )
+        )
+      )
+      (feat_out): Linear(in_features=1536, out_features=240, bias=False)
+      (prob_out): Linear(in_features=1536, out_features=3, bias=True)
+    )
+    (taco2_loss): Tacotron2Loss(
+      (l1_criterion): L1Loss()
+      (mse_criterion): MSELoss()
+      (bce_criterion): BCEWithLogitsLoss()
+    )
+    (attn_loss): GuidedAttentionLoss()
+  )
+)
+Model summary:
+    Class Name: ESPnetTTSModel
+    Total Number of model parameters: 26.91 M
+    Number of trainable parameters: 26.91 M (100.0%)
+    Size: 107.63 MB
+    Type: torch.float32
+[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1207) INFO: Optimizer:
+Adam (
+Parameter Group 0
+    amsgrad: False
+    betas: (0.9, 0.999)
+    capturable: False
+    differentiable: False
+    eps: 1e-06
+    foreach: None
+    fused: None
+    lr: 0.001
+    maximize: False
+    weight_decay: 0.0
+)
+[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1208) INFO: Scheduler: None
+[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml
+[7850374a3496] 2023-07-13 14:10:53,009 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.15', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.15.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.15.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
+/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
+Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
+  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
+# Accounting: time=12 threads=1
+# Ended (code 0) at Thu Jul 13 14:10:56 UTC 2023, elapsed time 12 seconds

exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml ADDED Viewed

	@@ -0,0 +1,267 @@

+config: conf/tuning/finetune_tacotron2.yaml
+print_config: false
+log_level: INFO
+dry_run: false
+iterator_type: sequence
+output_dir: exp/tts_stats_raw_phn_none/logdir/stats.15
+ngpu: 0
+seed: 0
+num_workers: 1
+num_att_plot: 3
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: null
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: false
+unused_parameters: false
+sharded_ddp: false
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: true
+collect_stats: true
+write_collected_feats: false
+max_epoch: 120
+patience: null
+val_scheduler_criterion:
+- valid
+- loss
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+-   - valid
+    - loss
+    - min
+-   - train
+    - loss
+    - min
+keep_nbest_models: 5
+nbest_averaging_interval: 0
+grad_clip: 1.0
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: false
+train_dtype: float32
+use_amp: false
+log_interval: null
+use_matplotlib: true
+use_tensorboard: true
+create_graph_in_tensorboard: false
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+detect_anomaly: false
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param: []
+num_iters_per_epoch: 200
+batch_size: 20
+valid_batch_size: null
+batch_bins: 1600000
+valid_batch_bins: null
+train_shape_file:
+- exp/tts_stats_raw_phn_none/logdir/train.15.scp
+valid_shape_file:
+- exp/tts_stats_raw_phn_none/logdir/valid.15.scp
+batch_type: numel
+valid_batch_type: null
+fold_length: []
+sort_in_batch: descending
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+chunk_excluded_key_prefixes: []
+train_data_path_and_name_and_type:
+-   - dump/raw/train/text
+    - text
+    - text
+-   - dump/raw/train/wav.scp
+    - speech
+    - sound
+valid_data_path_and_name_and_type:
+-   - dump/raw/dev/text
+    - text
+    - text
+-   - dump/raw/dev/wav.scp
+    - speech
+    - sound
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+valid_max_cache_size: null
+exclude_weight_decay: false
+exclude_weight_decay_conf: {}
+optim: adam
+optim_conf:
+    lr: 0.001
+    eps: 1.0e-06
+    weight_decay: 0.0
+scheduler: null
+scheduler_conf: {}
+token_list:
+- <blank>
+- <unk>
+- a
+- sil
+- l
+- aa
+- m
+- ii0
+- t
+- <
+- n
+- r
+- E
+- i0
+- b
+- uu0
+- f
+- i1
+- k
+- w
+- A
+- s
+- y
+- d
+- q
+- h
+- H
+- $
+- u0
+- AA
+- j
+- T
+- x
+- S
+- z
+- ll
+- I1
+- D
+- II0
+- g
+- tt
+- rr
+- I0
+- UU0
+- dd
+- u1
+- U0
+- mm
+- nn
+- '*'
+- $$
+- bb
+- yy
+- ss
+- jj
+- ww
+- ^
+- SS
+- TT
+- Z
+- zz
+- kk
+- U1
+- HH
+- ff
+- qq
+- xx
+- ^^
+- DD
+- hh
+- EE
+- ZZ
+- '**'
+- aaaa
+- ssss
+- v
+- uu1
+- jjjj
+- <sos/eos>
+odim: null
+model_conf: {}
+use_preprocessor: true
+token_type: phn
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: null
+feats_extract: fbank
+feats_extract_conf:
+    n_fft: 1024
+    hop_length: 256
+    win_length: null
+    fs: 22050
+    fmin: 80
+    fmax: 7600
+    n_mels: 80
+normalize: null
+normalize_conf: {}
+tts: tacotron2
+tts_conf:
+    embed_dim: 512
+    elayers: 1
+    eunits: 512
+    econv_layers: 3
+    econv_chans: 512
+    econv_filts: 5
+    atype: location
+    adim: 512
+    aconv_chans: 32
+    aconv_filts: 15
+    cumulate_att_w: true
+    dlayers: 2
+    dunits: 1024
+    prenet_layers: 2
+    prenet_units: 256
+    postnet_layers: 5
+    postnet_chans: 512
+    postnet_filts: 5
+    output_activation: null
+    use_batch_norm: true
+    use_concate: true
+    use_residual: false
+    dropout_rate: 0.5
+    zoneout_rate: 0.1
+    reduction_factor: 3
+    spk_embed_dim: null
+    use_masking: true
+    bce_pos_weight: 20.0
+    use_guided_attn_loss: true
+    guided_attn_loss_sigma: 0.4
+    guided_attn_loss_lambda: 1.0
+pitch_extract: null
+pitch_extract_conf:
+    fs: 22050
+    n_fft: 1024
+    hop_length: 256
+    f0max: 400
+    f0min: 80
+pitch_normalize: null
+pitch_normalize_conf: {}
+energy_extract: null
+energy_extract_conf:
+    fs: 22050
+    n_fft: 1024
+    hop_length: 256
+    win_length: null
+energy_normalize: null
+energy_normalize_conf: {}
+required:
+- output_dir
+- token_list
+version: '202304'
+distributed: false

exp/tts_stats_raw_phn_none/logdir/stats.15/train/batch_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ text
2	+ speech

exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_lengths_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b7321393931081a400396aafc1edb9605c0808638ac13716f0f23942f51e167a
+size 778

exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:239fc3342b13b16bd839d1ac8b21666aad2e96b1b2341e9c6c4c53e063f99526
+size 1402

exp/tts_stats_raw_phn_none/logdir/stats.15/train/speech_shape ADDED Viewed

	@@ -0,0 +1,43 @@

+19928 141056
+19931 133376
+19935 203520
+19938 102144
+19944 126464
+19946 116992
+19947 154112
+19948 171637
+19949 141056
+19951 214272
+19952 165376
+19955 134912
+19957 150596
+19959 176896
+19976 169472
+19979 119808
+19981 134144
+19984 171520
+19990 235008
+19998 195840
+200 125440
+20001 184576
+20005 108032
+20020 164608
+20022 235264
+20029 174080
+20038 216576
+20042 241920
+20051 203776
+20055 168448
+20062 152064
+20080 219136
+20087 116992
+20095 193792
+201 119040
+20109 167424
+20119 149760
+20120 154368
+20121 172288
+20128 143872
+20144 112128
+20147 167168
+20183 139520

exp/tts_stats_raw_phn_none/logdir/stats.15/train/stats_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ feats
2	+ feats_lengths

exp/tts_stats_raw_phn_none/logdir/stats.15/train/text_shape ADDED Viewed

	@@ -0,0 +1,43 @@

+19928 71
+19931 84
+19935 117
+19938 58
+19944 71
+19946 52
+19947 71
+19948 80
+19949 64
+19951 133
+19952 102
+19955 64
+19957 81
+19959 106
+19976 97
+19979 66
+19981 66
+19984 80
+19990 127
+19998 100
+200 64
+20001 98
+20005 61
+20020 68
+20022 143
+20029 103
+20038 123
+20042 136
+20051 106
+20055 97
+20062 90
+20080 124
+20087 52
+20095 101
+201 67
+20109 82
+20119 64
+20120 93
+20121 83
+20128 67
+20144 69
+20147 78
+20183 70

exp/tts_stats_raw_phn_none/logdir/stats.15/valid/batch_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ text
2	+ speech

exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_lengths_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ab01415ef2a97eaa04e81355080bc38b3f9b0343f8e97e91044090b6ff63685
+size 778

exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89cf49faa040cf392b87903167b64d4599f1d322f1a2937c91823fca48e139a9
+size 1402

exp/tts_stats_raw_phn_none/logdir/stats.15/valid/speech_shape ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 19769 152064
2	+ 19771 194816

exp/tts_stats_raw_phn_none/logdir/stats.15/valid/stats_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ feats
2	+ feats_lengths

exp/tts_stats_raw_phn_none/logdir/stats.15/valid/text_shape ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ 19769 84
2	+ 19771 108

exp/tts_stats_raw_phn_none/logdir/stats.17.log ADDED Viewed

	@@ -0,0 +1,116 @@

+# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.17.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.17.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.17 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+# Started at Thu Jul 13 14:10:56 UTC 2023
+#
+/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
+  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
+/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.17.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.17.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.17 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
+[7850374a3496] 2023-07-13 14:11:04,338 (tts:293) INFO: Vocabulary size: 79
+[7850374a3496] 2023-07-13 14:11:05,061 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
+[7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1204) INFO: Model structure:
+ESPnetTTSModel(
+  (feats_extract): LogMelFbank(
+    (stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
+    (logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
+  )
+  (tts): Tacotron2(
+    (enc): Encoder(
+      (embed): Embedding(79, 512, padding_idx=0)
+      (convs): ModuleList(
+        (0-2): 3 x Sequential(
+          (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+          (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+          (2): ReLU()
+          (3): Dropout(p=0.5, inplace=False)
+        )
+      )
+      (blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
+    )
+    (dec): Decoder(
+      (att): AttLoc(
+        (mlp_enc): Linear(in_features=512, out_features=512, bias=True)
+        (mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
+        (mlp_att): Linear(in_features=32, out_features=512, bias=False)
+        (loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
+        (gvec): Linear(in_features=512, out_features=1, bias=True)
+      )
+      (lstm): ModuleList(
+        (0): ZoneOutCell(
+          (cell): LSTMCell(768, 1024)
+        )
+        (1): ZoneOutCell(
+          (cell): LSTMCell(1024, 1024)
+        )
+      )
+      (prenet): Prenet(
+        (prenet): ModuleList(
+          (0): Sequential(
+            (0): Linear(in_features=80, out_features=256, bias=True)
+            (1): ReLU()
+          )
+          (1): Sequential(
+            (0): Linear(in_features=256, out_features=256, bias=True)
+            (1): ReLU()
+          )
+        )
+      )
+      (postnet): Postnet(
+        (postnet): ModuleList(
+          (0): Sequential(
+            (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (1-3): 3 x Sequential(
+            (0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Tanh()
+            (3): Dropout(p=0.5, inplace=False)
+          )
+          (4): Sequential(
+            (0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
+            (1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
+            (2): Dropout(p=0.5, inplace=False)
+          )
+        )
+      )
+      (feat_out): Linear(in_features=1536, out_features=240, bias=False)
+      (prob_out): Linear(in_features=1536, out_features=3, bias=True)
+    )
+    (taco2_loss): Tacotron2Loss(
+      (l1_criterion): L1Loss()
+      (mse_criterion): MSELoss()
+      (bce_criterion): BCEWithLogitsLoss()
+    )
+    (attn_loss): GuidedAttentionLoss()
+  )
+)
+Model summary:
+    Class Name: ESPnetTTSModel
+    Total Number of model parameters: 26.91 M
+    Number of trainable parameters: 26.91 M (100.0%)
+    Size: 107.63 MB
+    Type: torch.float32
+[7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1207) INFO: Optimizer:
+Adam (
+Parameter Group 0
+    amsgrad: False
+    betas: (0.9, 0.999)
+    capturable: False
+    differentiable: False
+    eps: 1e-06
+    foreach: None
+    fused: None
+    lr: 0.001
+    maximize: False
+    weight_decay: 0.0
+)
+[7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1208) INFO: Scheduler: None
+[7850374a3496] 2023-07-13 14:11:05,065 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml
+[7850374a3496] 2023-07-13 14:11:05,100 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.17', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.17.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.17.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
+/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
+Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
+  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
+# Accounting: time=13 threads=1
+# Ended (code 0) at Thu Jul 13 14:11:09 UTC 2023, elapsed time 13 seconds

exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml ADDED Viewed

	@@ -0,0 +1,267 @@

+config: conf/tuning/finetune_tacotron2.yaml
+print_config: false
+log_level: INFO
+dry_run: false
+iterator_type: sequence
+output_dir: exp/tts_stats_raw_phn_none/logdir/stats.17
+ngpu: 0
+seed: 0
+num_workers: 1
+num_att_plot: 3
+dist_backend: nccl
+dist_init_method: env://
+dist_world_size: null
+dist_rank: null
+local_rank: null
+dist_master_addr: null
+dist_master_port: null
+dist_launcher: null
+multiprocessing_distributed: false
+unused_parameters: false
+sharded_ddp: false
+cudnn_enabled: true
+cudnn_benchmark: false
+cudnn_deterministic: true
+collect_stats: true
+write_collected_feats: false
+max_epoch: 120
+patience: null
+val_scheduler_criterion:
+- valid
+- loss
+early_stopping_criterion:
+- valid
+- loss
+- min
+best_model_criterion:
+-   - valid
+    - loss
+    - min
+-   - train
+    - loss
+    - min
+keep_nbest_models: 5
+nbest_averaging_interval: 0
+grad_clip: 1.0
+grad_clip_type: 2.0
+grad_noise: false
+accum_grad: 1
+no_forward_run: false
+resume: false
+train_dtype: float32
+use_amp: false
+log_interval: null
+use_matplotlib: true
+use_tensorboard: true
+create_graph_in_tensorboard: false
+use_wandb: false
+wandb_project: null
+wandb_id: null
+wandb_entity: null
+wandb_name: null
+wandb_model_log_interval: -1
+detect_anomaly: false
+pretrain_path: null
+init_param: []
+ignore_init_mismatch: false
+freeze_param: []
+num_iters_per_epoch: 200
+batch_size: 20
+valid_batch_size: null
+batch_bins: 1600000
+valid_batch_bins: null
+train_shape_file:
+- exp/tts_stats_raw_phn_none/logdir/train.17.scp
+valid_shape_file:
+- exp/tts_stats_raw_phn_none/logdir/valid.17.scp
+batch_type: numel
+valid_batch_type: null
+fold_length: []
+sort_in_batch: descending
+sort_batch: descending
+multiple_iterator: false
+chunk_length: 500
+chunk_shift_ratio: 0.5
+num_cache_chunks: 1024
+chunk_excluded_key_prefixes: []
+train_data_path_and_name_and_type:
+-   - dump/raw/train/text
+    - text
+    - text
+-   - dump/raw/train/wav.scp
+    - speech
+    - sound
+valid_data_path_and_name_and_type:
+-   - dump/raw/dev/text
+    - text
+    - text
+-   - dump/raw/dev/wav.scp
+    - speech
+    - sound
+allow_variable_data_keys: false
+max_cache_size: 0.0
+max_cache_fd: 32
+valid_max_cache_size: null
+exclude_weight_decay: false
+exclude_weight_decay_conf: {}
+optim: adam
+optim_conf:
+    lr: 0.001
+    eps: 1.0e-06
+    weight_decay: 0.0
+scheduler: null
+scheduler_conf: {}
+token_list:
+- <blank>
+- <unk>
+- a
+- sil
+- l
+- aa
+- m
+- ii0
+- t
+- <
+- n
+- r
+- E
+- i0
+- b
+- uu0
+- f
+- i1
+- k
+- w
+- A
+- s
+- y
+- d
+- q
+- h
+- H
+- $
+- u0
+- AA
+- j
+- T
+- x
+- S
+- z
+- ll
+- I1
+- D
+- II0
+- g
+- tt
+- rr
+- I0
+- UU0
+- dd
+- u1
+- U0
+- mm
+- nn
+- '*'
+- $$
+- bb
+- yy
+- ss
+- jj
+- ww
+- ^
+- SS
+- TT
+- Z
+- zz
+- kk
+- U1
+- HH
+- ff
+- qq
+- xx
+- ^^
+- DD
+- hh
+- EE
+- ZZ
+- '**'
+- aaaa
+- ssss
+- v
+- uu1
+- jjjj
+- <sos/eos>
+odim: null
+model_conf: {}
+use_preprocessor: true
+token_type: phn
+bpemodel: null
+non_linguistic_symbols: null
+cleaner: null
+g2p: null
+feats_extract: fbank
+feats_extract_conf:
+    n_fft: 1024
+    hop_length: 256
+    win_length: null
+    fs: 22050
+    fmin: 80
+    fmax: 7600
+    n_mels: 80
+normalize: null
+normalize_conf: {}
+tts: tacotron2
+tts_conf:
+    embed_dim: 512
+    elayers: 1
+    eunits: 512
+    econv_layers: 3
+    econv_chans: 512
+    econv_filts: 5
+    atype: location
+    adim: 512
+    aconv_chans: 32
+    aconv_filts: 15
+    cumulate_att_w: true
+    dlayers: 2
+    dunits: 1024
+    prenet_layers: 2
+    prenet_units: 256
+    postnet_layers: 5
+    postnet_chans: 512
+    postnet_filts: 5
+    output_activation: null
+    use_batch_norm: true
+    use_concate: true
+    use_residual: false
+    dropout_rate: 0.5
+    zoneout_rate: 0.1
+    reduction_factor: 3
+    spk_embed_dim: null
+    use_masking: true
+    bce_pos_weight: 20.0
+    use_guided_attn_loss: true
+    guided_attn_loss_sigma: 0.4
+    guided_attn_loss_lambda: 1.0
+pitch_extract: null
+pitch_extract_conf:
+    fs: 22050
+    n_fft: 1024
+    hop_length: 256
+    f0max: 400
+    f0min: 80
+pitch_normalize: null
+pitch_normalize_conf: {}
+energy_extract: null
+energy_extract_conf:
+    fs: 22050
+    n_fft: 1024
+    hop_length: 256
+    win_length: null
+energy_normalize: null
+energy_normalize_conf: {}
+required:
+- output_dir
+- token_list
+version: '202304'
+distributed: false

exp/tts_stats_raw_phn_none/logdir/stats.17/train/batch_keys ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ text
2	+ speech

exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_lengths_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:503b8c916b78f4942fd868b9337455d0a4593217bf37efc50c5e8192e7949a22
+size 778

exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_stats.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:242283537138ddfc5699bdd17945a6d6bf4a95ff1d368666989414d0b47ca626
+size 1402