Upload 423 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- exp/tts_stats_raw_phn_none/logdir/stats.1.log +116 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/config.yaml +267 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/train/batch_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_lengths_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/train/speech_shape +43 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/train/stats_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/train/text_shape +43 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/valid/batch_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_lengths_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/valid/speech_shape +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/valid/stats_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.10/valid/text_shape +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.11.log +116 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12.log +116 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml +267 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/train/batch_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_lengths_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/train/speech_shape +43 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/train/stats_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/train/text_shape +43 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/valid/batch_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_lengths_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/valid/speech_shape +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/valid/stats_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.12/valid/text_shape +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.13.log +116 -0
- exp/tts_stats_raw_phn_none/logdir/stats.14.log +116 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15.log +116 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml +267 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/train/batch_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_lengths_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/train/speech_shape +43 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/train/stats_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/train/text_shape +43 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/valid/batch_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_lengths_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/valid/speech_shape +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/valid/stats_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.15/valid/text_shape +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.17.log +116 -0
- exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml +267 -0
- exp/tts_stats_raw_phn_none/logdir/stats.17/train/batch_keys +2 -0
- exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_lengths_stats.npz +3 -0
- exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_stats.npz +3 -0
exp/tts_stats_raw_phn_none/logdir/stats.1.log
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.1 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
2 |
+
# Started at Thu Jul 13 14:09:11 UTC 2023
|
3 |
+
#
|
4 |
+
/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
|
5 |
+
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
|
6 |
+
/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.1.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.1.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.1 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
7 |
+
[7850374a3496] 2023-07-13 14:09:21,971 (tts:293) INFO: Vocabulary size: 79
|
8 |
+
[7850374a3496] 2023-07-13 14:09:22,770 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
|
9 |
+
[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1204) INFO: Model structure:
|
10 |
+
ESPnetTTSModel(
|
11 |
+
(feats_extract): LogMelFbank(
|
12 |
+
(stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
|
13 |
+
(logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
|
14 |
+
)
|
15 |
+
(tts): Tacotron2(
|
16 |
+
(enc): Encoder(
|
17 |
+
(embed): Embedding(79, 512, padding_idx=0)
|
18 |
+
(convs): ModuleList(
|
19 |
+
(0-2): 3 x Sequential(
|
20 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
21 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
22 |
+
(2): ReLU()
|
23 |
+
(3): Dropout(p=0.5, inplace=False)
|
24 |
+
)
|
25 |
+
)
|
26 |
+
(blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
|
27 |
+
)
|
28 |
+
(dec): Decoder(
|
29 |
+
(att): AttLoc(
|
30 |
+
(mlp_enc): Linear(in_features=512, out_features=512, bias=True)
|
31 |
+
(mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
|
32 |
+
(mlp_att): Linear(in_features=32, out_features=512, bias=False)
|
33 |
+
(loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
|
34 |
+
(gvec): Linear(in_features=512, out_features=1, bias=True)
|
35 |
+
)
|
36 |
+
(lstm): ModuleList(
|
37 |
+
(0): ZoneOutCell(
|
38 |
+
(cell): LSTMCell(768, 1024)
|
39 |
+
)
|
40 |
+
(1): ZoneOutCell(
|
41 |
+
(cell): LSTMCell(1024, 1024)
|
42 |
+
)
|
43 |
+
)
|
44 |
+
(prenet): Prenet(
|
45 |
+
(prenet): ModuleList(
|
46 |
+
(0): Sequential(
|
47 |
+
(0): Linear(in_features=80, out_features=256, bias=True)
|
48 |
+
(1): ReLU()
|
49 |
+
)
|
50 |
+
(1): Sequential(
|
51 |
+
(0): Linear(in_features=256, out_features=256, bias=True)
|
52 |
+
(1): ReLU()
|
53 |
+
)
|
54 |
+
)
|
55 |
+
)
|
56 |
+
(postnet): Postnet(
|
57 |
+
(postnet): ModuleList(
|
58 |
+
(0): Sequential(
|
59 |
+
(0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
60 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
61 |
+
(2): Tanh()
|
62 |
+
(3): Dropout(p=0.5, inplace=False)
|
63 |
+
)
|
64 |
+
(1-3): 3 x Sequential(
|
65 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
66 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
67 |
+
(2): Tanh()
|
68 |
+
(3): Dropout(p=0.5, inplace=False)
|
69 |
+
)
|
70 |
+
(4): Sequential(
|
71 |
+
(0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
72 |
+
(1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
73 |
+
(2): Dropout(p=0.5, inplace=False)
|
74 |
+
)
|
75 |
+
)
|
76 |
+
)
|
77 |
+
(feat_out): Linear(in_features=1536, out_features=240, bias=False)
|
78 |
+
(prob_out): Linear(in_features=1536, out_features=3, bias=True)
|
79 |
+
)
|
80 |
+
(taco2_loss): Tacotron2Loss(
|
81 |
+
(l1_criterion): L1Loss()
|
82 |
+
(mse_criterion): MSELoss()
|
83 |
+
(bce_criterion): BCEWithLogitsLoss()
|
84 |
+
)
|
85 |
+
(attn_loss): GuidedAttentionLoss()
|
86 |
+
)
|
87 |
+
)
|
88 |
+
|
89 |
+
Model summary:
|
90 |
+
Class Name: ESPnetTTSModel
|
91 |
+
Total Number of model parameters: 26.91 M
|
92 |
+
Number of trainable parameters: 26.91 M (100.0%)
|
93 |
+
Size: 107.63 MB
|
94 |
+
Type: torch.float32
|
95 |
+
[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1207) INFO: Optimizer:
|
96 |
+
Adam (
|
97 |
+
Parameter Group 0
|
98 |
+
amsgrad: False
|
99 |
+
betas: (0.9, 0.999)
|
100 |
+
capturable: False
|
101 |
+
differentiable: False
|
102 |
+
eps: 1e-06
|
103 |
+
foreach: None
|
104 |
+
fused: None
|
105 |
+
lr: 0.001
|
106 |
+
maximize: False
|
107 |
+
weight_decay: 0.0
|
108 |
+
)
|
109 |
+
[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1208) INFO: Scheduler: None
|
110 |
+
[7850374a3496] 2023-07-13 14:09:22,773 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.1/config.yaml
|
111 |
+
[7850374a3496] 2023-07-13 14:09:22,799 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.1', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.1.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.1.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
|
112 |
+
/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
|
113 |
+
Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
|
114 |
+
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
|
115 |
+
# Accounting: time=16 threads=1
|
116 |
+
# Ended (code 0) at Thu Jul 13 14:09:27 UTC 2023, elapsed time 16 seconds
|
exp/tts_stats_raw_phn_none/logdir/stats.10/config.yaml
ADDED
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config: conf/tuning/finetune_tacotron2.yaml
|
2 |
+
print_config: false
|
3 |
+
log_level: INFO
|
4 |
+
dry_run: false
|
5 |
+
iterator_type: sequence
|
6 |
+
output_dir: exp/tts_stats_raw_phn_none/logdir/stats.10
|
7 |
+
ngpu: 0
|
8 |
+
seed: 0
|
9 |
+
num_workers: 1
|
10 |
+
num_att_plot: 3
|
11 |
+
dist_backend: nccl
|
12 |
+
dist_init_method: env://
|
13 |
+
dist_world_size: null
|
14 |
+
dist_rank: null
|
15 |
+
local_rank: null
|
16 |
+
dist_master_addr: null
|
17 |
+
dist_master_port: null
|
18 |
+
dist_launcher: null
|
19 |
+
multiprocessing_distributed: false
|
20 |
+
unused_parameters: false
|
21 |
+
sharded_ddp: false
|
22 |
+
cudnn_enabled: true
|
23 |
+
cudnn_benchmark: false
|
24 |
+
cudnn_deterministic: true
|
25 |
+
collect_stats: true
|
26 |
+
write_collected_feats: false
|
27 |
+
max_epoch: 120
|
28 |
+
patience: null
|
29 |
+
val_scheduler_criterion:
|
30 |
+
- valid
|
31 |
+
- loss
|
32 |
+
early_stopping_criterion:
|
33 |
+
- valid
|
34 |
+
- loss
|
35 |
+
- min
|
36 |
+
best_model_criterion:
|
37 |
+
- - valid
|
38 |
+
- loss
|
39 |
+
- min
|
40 |
+
- - train
|
41 |
+
- loss
|
42 |
+
- min
|
43 |
+
keep_nbest_models: 5
|
44 |
+
nbest_averaging_interval: 0
|
45 |
+
grad_clip: 1.0
|
46 |
+
grad_clip_type: 2.0
|
47 |
+
grad_noise: false
|
48 |
+
accum_grad: 1
|
49 |
+
no_forward_run: false
|
50 |
+
resume: false
|
51 |
+
train_dtype: float32
|
52 |
+
use_amp: false
|
53 |
+
log_interval: null
|
54 |
+
use_matplotlib: true
|
55 |
+
use_tensorboard: true
|
56 |
+
create_graph_in_tensorboard: false
|
57 |
+
use_wandb: false
|
58 |
+
wandb_project: null
|
59 |
+
wandb_id: null
|
60 |
+
wandb_entity: null
|
61 |
+
wandb_name: null
|
62 |
+
wandb_model_log_interval: -1
|
63 |
+
detect_anomaly: false
|
64 |
+
pretrain_path: null
|
65 |
+
init_param: []
|
66 |
+
ignore_init_mismatch: false
|
67 |
+
freeze_param: []
|
68 |
+
num_iters_per_epoch: 200
|
69 |
+
batch_size: 20
|
70 |
+
valid_batch_size: null
|
71 |
+
batch_bins: 1600000
|
72 |
+
valid_batch_bins: null
|
73 |
+
train_shape_file:
|
74 |
+
- exp/tts_stats_raw_phn_none/logdir/train.10.scp
|
75 |
+
valid_shape_file:
|
76 |
+
- exp/tts_stats_raw_phn_none/logdir/valid.10.scp
|
77 |
+
batch_type: numel
|
78 |
+
valid_batch_type: null
|
79 |
+
fold_length: []
|
80 |
+
sort_in_batch: descending
|
81 |
+
sort_batch: descending
|
82 |
+
multiple_iterator: false
|
83 |
+
chunk_length: 500
|
84 |
+
chunk_shift_ratio: 0.5
|
85 |
+
num_cache_chunks: 1024
|
86 |
+
chunk_excluded_key_prefixes: []
|
87 |
+
train_data_path_and_name_and_type:
|
88 |
+
- - dump/raw/train/text
|
89 |
+
- text
|
90 |
+
- text
|
91 |
+
- - dump/raw/train/wav.scp
|
92 |
+
- speech
|
93 |
+
- sound
|
94 |
+
valid_data_path_and_name_and_type:
|
95 |
+
- - dump/raw/dev/text
|
96 |
+
- text
|
97 |
+
- text
|
98 |
+
- - dump/raw/dev/wav.scp
|
99 |
+
- speech
|
100 |
+
- sound
|
101 |
+
allow_variable_data_keys: false
|
102 |
+
max_cache_size: 0.0
|
103 |
+
max_cache_fd: 32
|
104 |
+
valid_max_cache_size: null
|
105 |
+
exclude_weight_decay: false
|
106 |
+
exclude_weight_decay_conf: {}
|
107 |
+
optim: adam
|
108 |
+
optim_conf:
|
109 |
+
lr: 0.001
|
110 |
+
eps: 1.0e-06
|
111 |
+
weight_decay: 0.0
|
112 |
+
scheduler: null
|
113 |
+
scheduler_conf: {}
|
114 |
+
token_list:
|
115 |
+
- <blank>
|
116 |
+
- <unk>
|
117 |
+
- a
|
118 |
+
- sil
|
119 |
+
- l
|
120 |
+
- aa
|
121 |
+
- m
|
122 |
+
- ii0
|
123 |
+
- t
|
124 |
+
- <
|
125 |
+
- n
|
126 |
+
- r
|
127 |
+
- E
|
128 |
+
- i0
|
129 |
+
- b
|
130 |
+
- uu0
|
131 |
+
- f
|
132 |
+
- i1
|
133 |
+
- k
|
134 |
+
- w
|
135 |
+
- A
|
136 |
+
- s
|
137 |
+
- y
|
138 |
+
- d
|
139 |
+
- q
|
140 |
+
- h
|
141 |
+
- H
|
142 |
+
- $
|
143 |
+
- u0
|
144 |
+
- AA
|
145 |
+
- j
|
146 |
+
- T
|
147 |
+
- x
|
148 |
+
- S
|
149 |
+
- z
|
150 |
+
- ll
|
151 |
+
- I1
|
152 |
+
- D
|
153 |
+
- II0
|
154 |
+
- g
|
155 |
+
- tt
|
156 |
+
- rr
|
157 |
+
- I0
|
158 |
+
- UU0
|
159 |
+
- dd
|
160 |
+
- u1
|
161 |
+
- U0
|
162 |
+
- mm
|
163 |
+
- nn
|
164 |
+
- '*'
|
165 |
+
- $$
|
166 |
+
- bb
|
167 |
+
- yy
|
168 |
+
- ss
|
169 |
+
- jj
|
170 |
+
- ww
|
171 |
+
- ^
|
172 |
+
- SS
|
173 |
+
- TT
|
174 |
+
- Z
|
175 |
+
- zz
|
176 |
+
- kk
|
177 |
+
- U1
|
178 |
+
- HH
|
179 |
+
- ff
|
180 |
+
- qq
|
181 |
+
- xx
|
182 |
+
- ^^
|
183 |
+
- DD
|
184 |
+
- hh
|
185 |
+
- EE
|
186 |
+
- ZZ
|
187 |
+
- '**'
|
188 |
+
- aaaa
|
189 |
+
- ssss
|
190 |
+
- v
|
191 |
+
- uu1
|
192 |
+
- jjjj
|
193 |
+
- <sos/eos>
|
194 |
+
odim: null
|
195 |
+
model_conf: {}
|
196 |
+
use_preprocessor: true
|
197 |
+
token_type: phn
|
198 |
+
bpemodel: null
|
199 |
+
non_linguistic_symbols: null
|
200 |
+
cleaner: null
|
201 |
+
g2p: null
|
202 |
+
feats_extract: fbank
|
203 |
+
feats_extract_conf:
|
204 |
+
n_fft: 1024
|
205 |
+
hop_length: 256
|
206 |
+
win_length: null
|
207 |
+
fs: 22050
|
208 |
+
fmin: 80
|
209 |
+
fmax: 7600
|
210 |
+
n_mels: 80
|
211 |
+
normalize: null
|
212 |
+
normalize_conf: {}
|
213 |
+
tts: tacotron2
|
214 |
+
tts_conf:
|
215 |
+
embed_dim: 512
|
216 |
+
elayers: 1
|
217 |
+
eunits: 512
|
218 |
+
econv_layers: 3
|
219 |
+
econv_chans: 512
|
220 |
+
econv_filts: 5
|
221 |
+
atype: location
|
222 |
+
adim: 512
|
223 |
+
aconv_chans: 32
|
224 |
+
aconv_filts: 15
|
225 |
+
cumulate_att_w: true
|
226 |
+
dlayers: 2
|
227 |
+
dunits: 1024
|
228 |
+
prenet_layers: 2
|
229 |
+
prenet_units: 256
|
230 |
+
postnet_layers: 5
|
231 |
+
postnet_chans: 512
|
232 |
+
postnet_filts: 5
|
233 |
+
output_activation: null
|
234 |
+
use_batch_norm: true
|
235 |
+
use_concate: true
|
236 |
+
use_residual: false
|
237 |
+
dropout_rate: 0.5
|
238 |
+
zoneout_rate: 0.1
|
239 |
+
reduction_factor: 3
|
240 |
+
spk_embed_dim: null
|
241 |
+
use_masking: true
|
242 |
+
bce_pos_weight: 20.0
|
243 |
+
use_guided_attn_loss: true
|
244 |
+
guided_attn_loss_sigma: 0.4
|
245 |
+
guided_attn_loss_lambda: 1.0
|
246 |
+
pitch_extract: null
|
247 |
+
pitch_extract_conf:
|
248 |
+
fs: 22050
|
249 |
+
n_fft: 1024
|
250 |
+
hop_length: 256
|
251 |
+
f0max: 400
|
252 |
+
f0min: 80
|
253 |
+
pitch_normalize: null
|
254 |
+
pitch_normalize_conf: {}
|
255 |
+
energy_extract: null
|
256 |
+
energy_extract_conf:
|
257 |
+
fs: 22050
|
258 |
+
n_fft: 1024
|
259 |
+
hop_length: 256
|
260 |
+
win_length: null
|
261 |
+
energy_normalize: null
|
262 |
+
energy_normalize_conf: {}
|
263 |
+
required:
|
264 |
+
- output_dir
|
265 |
+
- token_list
|
266 |
+
version: '202304'
|
267 |
+
distributed: false
|
exp/tts_stats_raw_phn_none/logdir/stats.10/train/batch_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
text
|
2 |
+
speech
|
exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_lengths_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8d6b9f2fd6232f4b0ca33457b5d22c02d2b17b34d24e2f9f1f2415b0ec8a15f0
|
3 |
+
size 778
|
exp/tts_stats_raw_phn_none/logdir/stats.10/train/feats_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:057d10f13786abd5b7b6b90bea854b18ad227d34f19bb8092c488f864880dd51
|
3 |
+
size 1402
|
exp/tts_stats_raw_phn_none/logdir/stats.10/train/speech_shape
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
18935 142336
|
2 |
+
18936 141568
|
3 |
+
18943 175360
|
4 |
+
18944 173824
|
5 |
+
18947 190208
|
6 |
+
18951 154368
|
7 |
+
18955 233216
|
8 |
+
18959 226560
|
9 |
+
18964 163584
|
10 |
+
18982 113664
|
11 |
+
18989 163072
|
12 |
+
18991 212480
|
13 |
+
18993 175872
|
14 |
+
18997 101888
|
15 |
+
19 122880
|
16 |
+
19001 217088
|
17 |
+
19005 184832
|
18 |
+
19010 156928
|
19 |
+
19011 175872
|
20 |
+
19015 139520
|
21 |
+
19024 165888
|
22 |
+
19028 158720
|
23 |
+
19063 187136
|
24 |
+
19065 144128
|
25 |
+
19067 175616
|
26 |
+
19075 163584
|
27 |
+
19076 214784
|
28 |
+
19090 172544
|
29 |
+
19091 199936
|
30 |
+
19095 118016
|
31 |
+
19096 165888
|
32 |
+
19099 159488
|
33 |
+
191 134144
|
34 |
+
19103 124416
|
35 |
+
19109 132352
|
36 |
+
19111 151740
|
37 |
+
19113 129280
|
38 |
+
19116 155648
|
39 |
+
19118 174336
|
40 |
+
19121 137472
|
41 |
+
19122 144896
|
42 |
+
19132 131072
|
43 |
+
19138 135936
|
exp/tts_stats_raw_phn_none/logdir/stats.10/train/stats_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
feats
|
2 |
+
feats_lengths
|
exp/tts_stats_raw_phn_none/logdir/stats.10/train/text_shape
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
18935 66
|
2 |
+
18936 77
|
3 |
+
18943 94
|
4 |
+
18944 90
|
5 |
+
18947 93
|
6 |
+
18951 66
|
7 |
+
18955 116
|
8 |
+
18959 120
|
9 |
+
18964 81
|
10 |
+
18982 54
|
11 |
+
18989 85
|
12 |
+
18991 114
|
13 |
+
18993 100
|
14 |
+
18997 45
|
15 |
+
19 58
|
16 |
+
19001 132
|
17 |
+
19005 97
|
18 |
+
19010 82
|
19 |
+
19011 97
|
20 |
+
19015 72
|
21 |
+
19024 90
|
22 |
+
19028 71
|
23 |
+
19063 115
|
24 |
+
19065 84
|
25 |
+
19067 83
|
26 |
+
19075 78
|
27 |
+
19076 112
|
28 |
+
19090 92
|
29 |
+
19091 108
|
30 |
+
19095 62
|
31 |
+
19096 89
|
32 |
+
19099 87
|
33 |
+
191 70
|
34 |
+
19103 68
|
35 |
+
19109 75
|
36 |
+
19111 80
|
37 |
+
19113 45
|
38 |
+
19116 87
|
39 |
+
19118 97
|
40 |
+
19121 74
|
41 |
+
19122 87
|
42 |
+
19132 69
|
43 |
+
19138 75
|
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/batch_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
text
|
2 |
+
speech
|
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_lengths_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9fe87f28f6100dafb92cda513225e57bd983e4483dbefd895ad65790398958c0
|
3 |
+
size 778
|
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/feats_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8c23bf05ba35b7d316b51347290281e31e36aca870887098c995fd8f5c860508
|
3 |
+
size 1402
|
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/speech_shape
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
169 189952
|
2 |
+
18237 234496
|
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/stats_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
feats
|
2 |
+
feats_lengths
|
exp/tts_stats_raw_phn_none/logdir/stats.10/valid/text_shape
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
169 104
|
2 |
+
18237 134
|
exp/tts_stats_raw_phn_none/logdir/stats.11.log
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.11.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.11.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.11 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
2 |
+
# Started at Thu Jul 13 14:10:19 UTC 2023
|
3 |
+
#
|
4 |
+
/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
|
5 |
+
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
|
6 |
+
/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.11.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.11.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.11 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
7 |
+
[7850374a3496] 2023-07-13 14:10:27,026 (tts:293) INFO: Vocabulary size: 79
|
8 |
+
[7850374a3496] 2023-07-13 14:10:27,731 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
|
9 |
+
[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1204) INFO: Model structure:
|
10 |
+
ESPnetTTSModel(
|
11 |
+
(feats_extract): LogMelFbank(
|
12 |
+
(stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
|
13 |
+
(logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
|
14 |
+
)
|
15 |
+
(tts): Tacotron2(
|
16 |
+
(enc): Encoder(
|
17 |
+
(embed): Embedding(79, 512, padding_idx=0)
|
18 |
+
(convs): ModuleList(
|
19 |
+
(0-2): 3 x Sequential(
|
20 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
21 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
22 |
+
(2): ReLU()
|
23 |
+
(3): Dropout(p=0.5, inplace=False)
|
24 |
+
)
|
25 |
+
)
|
26 |
+
(blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
|
27 |
+
)
|
28 |
+
(dec): Decoder(
|
29 |
+
(att): AttLoc(
|
30 |
+
(mlp_enc): Linear(in_features=512, out_features=512, bias=True)
|
31 |
+
(mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
|
32 |
+
(mlp_att): Linear(in_features=32, out_features=512, bias=False)
|
33 |
+
(loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
|
34 |
+
(gvec): Linear(in_features=512, out_features=1, bias=True)
|
35 |
+
)
|
36 |
+
(lstm): ModuleList(
|
37 |
+
(0): ZoneOutCell(
|
38 |
+
(cell): LSTMCell(768, 1024)
|
39 |
+
)
|
40 |
+
(1): ZoneOutCell(
|
41 |
+
(cell): LSTMCell(1024, 1024)
|
42 |
+
)
|
43 |
+
)
|
44 |
+
(prenet): Prenet(
|
45 |
+
(prenet): ModuleList(
|
46 |
+
(0): Sequential(
|
47 |
+
(0): Linear(in_features=80, out_features=256, bias=True)
|
48 |
+
(1): ReLU()
|
49 |
+
)
|
50 |
+
(1): Sequential(
|
51 |
+
(0): Linear(in_features=256, out_features=256, bias=True)
|
52 |
+
(1): ReLU()
|
53 |
+
)
|
54 |
+
)
|
55 |
+
)
|
56 |
+
(postnet): Postnet(
|
57 |
+
(postnet): ModuleList(
|
58 |
+
(0): Sequential(
|
59 |
+
(0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
60 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
61 |
+
(2): Tanh()
|
62 |
+
(3): Dropout(p=0.5, inplace=False)
|
63 |
+
)
|
64 |
+
(1-3): 3 x Sequential(
|
65 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
66 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
67 |
+
(2): Tanh()
|
68 |
+
(3): Dropout(p=0.5, inplace=False)
|
69 |
+
)
|
70 |
+
(4): Sequential(
|
71 |
+
(0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
72 |
+
(1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
73 |
+
(2): Dropout(p=0.5, inplace=False)
|
74 |
+
)
|
75 |
+
)
|
76 |
+
)
|
77 |
+
(feat_out): Linear(in_features=1536, out_features=240, bias=False)
|
78 |
+
(prob_out): Linear(in_features=1536, out_features=3, bias=True)
|
79 |
+
)
|
80 |
+
(taco2_loss): Tacotron2Loss(
|
81 |
+
(l1_criterion): L1Loss()
|
82 |
+
(mse_criterion): MSELoss()
|
83 |
+
(bce_criterion): BCEWithLogitsLoss()
|
84 |
+
)
|
85 |
+
(attn_loss): GuidedAttentionLoss()
|
86 |
+
)
|
87 |
+
)
|
88 |
+
|
89 |
+
Model summary:
|
90 |
+
Class Name: ESPnetTTSModel
|
91 |
+
Total Number of model parameters: 26.91 M
|
92 |
+
Number of trainable parameters: 26.91 M (100.0%)
|
93 |
+
Size: 107.63 MB
|
94 |
+
Type: torch.float32
|
95 |
+
[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1207) INFO: Optimizer:
|
96 |
+
Adam (
|
97 |
+
Parameter Group 0
|
98 |
+
amsgrad: False
|
99 |
+
betas: (0.9, 0.999)
|
100 |
+
capturable: False
|
101 |
+
differentiable: False
|
102 |
+
eps: 1e-06
|
103 |
+
foreach: None
|
104 |
+
fused: None
|
105 |
+
lr: 0.001
|
106 |
+
maximize: False
|
107 |
+
weight_decay: 0.0
|
108 |
+
)
|
109 |
+
[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1208) INFO: Scheduler: None
|
110 |
+
[7850374a3496] 2023-07-13 14:10:27,734 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.11/config.yaml
|
111 |
+
[7850374a3496] 2023-07-13 14:10:27,761 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.11', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.11.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.11.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
|
112 |
+
/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
|
113 |
+
Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
|
114 |
+
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
|
115 |
+
# Accounting: time=12 threads=1
|
116 |
+
# Ended (code 0) at Thu Jul 13 14:10:31 UTC 2023, elapsed time 12 seconds
|
exp/tts_stats_raw_phn_none/logdir/stats.12.log
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.12.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.12.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.12 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
2 |
+
# Started at Thu Jul 13 14:10:19 UTC 2023
|
3 |
+
#
|
4 |
+
/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
|
5 |
+
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
|
6 |
+
/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.12.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.12.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.12 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
7 |
+
[7850374a3496] 2023-07-13 14:10:27,287 (tts:293) INFO: Vocabulary size: 79
|
8 |
+
[7850374a3496] 2023-07-13 14:10:27,998 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
|
9 |
+
[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1204) INFO: Model structure:
|
10 |
+
ESPnetTTSModel(
|
11 |
+
(feats_extract): LogMelFbank(
|
12 |
+
(stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
|
13 |
+
(logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
|
14 |
+
)
|
15 |
+
(tts): Tacotron2(
|
16 |
+
(enc): Encoder(
|
17 |
+
(embed): Embedding(79, 512, padding_idx=0)
|
18 |
+
(convs): ModuleList(
|
19 |
+
(0-2): 3 x Sequential(
|
20 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
21 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
22 |
+
(2): ReLU()
|
23 |
+
(3): Dropout(p=0.5, inplace=False)
|
24 |
+
)
|
25 |
+
)
|
26 |
+
(blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
|
27 |
+
)
|
28 |
+
(dec): Decoder(
|
29 |
+
(att): AttLoc(
|
30 |
+
(mlp_enc): Linear(in_features=512, out_features=512, bias=True)
|
31 |
+
(mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
|
32 |
+
(mlp_att): Linear(in_features=32, out_features=512, bias=False)
|
33 |
+
(loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
|
34 |
+
(gvec): Linear(in_features=512, out_features=1, bias=True)
|
35 |
+
)
|
36 |
+
(lstm): ModuleList(
|
37 |
+
(0): ZoneOutCell(
|
38 |
+
(cell): LSTMCell(768, 1024)
|
39 |
+
)
|
40 |
+
(1): ZoneOutCell(
|
41 |
+
(cell): LSTMCell(1024, 1024)
|
42 |
+
)
|
43 |
+
)
|
44 |
+
(prenet): Prenet(
|
45 |
+
(prenet): ModuleList(
|
46 |
+
(0): Sequential(
|
47 |
+
(0): Linear(in_features=80, out_features=256, bias=True)
|
48 |
+
(1): ReLU()
|
49 |
+
)
|
50 |
+
(1): Sequential(
|
51 |
+
(0): Linear(in_features=256, out_features=256, bias=True)
|
52 |
+
(1): ReLU()
|
53 |
+
)
|
54 |
+
)
|
55 |
+
)
|
56 |
+
(postnet): Postnet(
|
57 |
+
(postnet): ModuleList(
|
58 |
+
(0): Sequential(
|
59 |
+
(0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
60 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
61 |
+
(2): Tanh()
|
62 |
+
(3): Dropout(p=0.5, inplace=False)
|
63 |
+
)
|
64 |
+
(1-3): 3 x Sequential(
|
65 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
66 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
67 |
+
(2): Tanh()
|
68 |
+
(3): Dropout(p=0.5, inplace=False)
|
69 |
+
)
|
70 |
+
(4): Sequential(
|
71 |
+
(0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
72 |
+
(1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
73 |
+
(2): Dropout(p=0.5, inplace=False)
|
74 |
+
)
|
75 |
+
)
|
76 |
+
)
|
77 |
+
(feat_out): Linear(in_features=1536, out_features=240, bias=False)
|
78 |
+
(prob_out): Linear(in_features=1536, out_features=3, bias=True)
|
79 |
+
)
|
80 |
+
(taco2_loss): Tacotron2Loss(
|
81 |
+
(l1_criterion): L1Loss()
|
82 |
+
(mse_criterion): MSELoss()
|
83 |
+
(bce_criterion): BCEWithLogitsLoss()
|
84 |
+
)
|
85 |
+
(attn_loss): GuidedAttentionLoss()
|
86 |
+
)
|
87 |
+
)
|
88 |
+
|
89 |
+
Model summary:
|
90 |
+
Class Name: ESPnetTTSModel
|
91 |
+
Total Number of model parameters: 26.91 M
|
92 |
+
Number of trainable parameters: 26.91 M (100.0%)
|
93 |
+
Size: 107.63 MB
|
94 |
+
Type: torch.float32
|
95 |
+
[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1207) INFO: Optimizer:
|
96 |
+
Adam (
|
97 |
+
Parameter Group 0
|
98 |
+
amsgrad: False
|
99 |
+
betas: (0.9, 0.999)
|
100 |
+
capturable: False
|
101 |
+
differentiable: False
|
102 |
+
eps: 1e-06
|
103 |
+
foreach: None
|
104 |
+
fused: None
|
105 |
+
lr: 0.001
|
106 |
+
maximize: False
|
107 |
+
weight_decay: 0.0
|
108 |
+
)
|
109 |
+
[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1208) INFO: Scheduler: None
|
110 |
+
[7850374a3496] 2023-07-13 14:10:28,001 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml
|
111 |
+
[7850374a3496] 2023-07-13 14:10:28,024 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.12', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.12.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.12.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
|
112 |
+
/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
|
113 |
+
Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
|
114 |
+
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
|
115 |
+
# Accounting: time=13 threads=1
|
116 |
+
# Ended (code 0) at Thu Jul 13 14:10:32 UTC 2023, elapsed time 13 seconds
|
exp/tts_stats_raw_phn_none/logdir/stats.12/config.yaml
ADDED
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config: conf/tuning/finetune_tacotron2.yaml
|
2 |
+
print_config: false
|
3 |
+
log_level: INFO
|
4 |
+
dry_run: false
|
5 |
+
iterator_type: sequence
|
6 |
+
output_dir: exp/tts_stats_raw_phn_none/logdir/stats.12
|
7 |
+
ngpu: 0
|
8 |
+
seed: 0
|
9 |
+
num_workers: 1
|
10 |
+
num_att_plot: 3
|
11 |
+
dist_backend: nccl
|
12 |
+
dist_init_method: env://
|
13 |
+
dist_world_size: null
|
14 |
+
dist_rank: null
|
15 |
+
local_rank: null
|
16 |
+
dist_master_addr: null
|
17 |
+
dist_master_port: null
|
18 |
+
dist_launcher: null
|
19 |
+
multiprocessing_distributed: false
|
20 |
+
unused_parameters: false
|
21 |
+
sharded_ddp: false
|
22 |
+
cudnn_enabled: true
|
23 |
+
cudnn_benchmark: false
|
24 |
+
cudnn_deterministic: true
|
25 |
+
collect_stats: true
|
26 |
+
write_collected_feats: false
|
27 |
+
max_epoch: 120
|
28 |
+
patience: null
|
29 |
+
val_scheduler_criterion:
|
30 |
+
- valid
|
31 |
+
- loss
|
32 |
+
early_stopping_criterion:
|
33 |
+
- valid
|
34 |
+
- loss
|
35 |
+
- min
|
36 |
+
best_model_criterion:
|
37 |
+
- - valid
|
38 |
+
- loss
|
39 |
+
- min
|
40 |
+
- - train
|
41 |
+
- loss
|
42 |
+
- min
|
43 |
+
keep_nbest_models: 5
|
44 |
+
nbest_averaging_interval: 0
|
45 |
+
grad_clip: 1.0
|
46 |
+
grad_clip_type: 2.0
|
47 |
+
grad_noise: false
|
48 |
+
accum_grad: 1
|
49 |
+
no_forward_run: false
|
50 |
+
resume: false
|
51 |
+
train_dtype: float32
|
52 |
+
use_amp: false
|
53 |
+
log_interval: null
|
54 |
+
use_matplotlib: true
|
55 |
+
use_tensorboard: true
|
56 |
+
create_graph_in_tensorboard: false
|
57 |
+
use_wandb: false
|
58 |
+
wandb_project: null
|
59 |
+
wandb_id: null
|
60 |
+
wandb_entity: null
|
61 |
+
wandb_name: null
|
62 |
+
wandb_model_log_interval: -1
|
63 |
+
detect_anomaly: false
|
64 |
+
pretrain_path: null
|
65 |
+
init_param: []
|
66 |
+
ignore_init_mismatch: false
|
67 |
+
freeze_param: []
|
68 |
+
num_iters_per_epoch: 200
|
69 |
+
batch_size: 20
|
70 |
+
valid_batch_size: null
|
71 |
+
batch_bins: 1600000
|
72 |
+
valid_batch_bins: null
|
73 |
+
train_shape_file:
|
74 |
+
- exp/tts_stats_raw_phn_none/logdir/train.12.scp
|
75 |
+
valid_shape_file:
|
76 |
+
- exp/tts_stats_raw_phn_none/logdir/valid.12.scp
|
77 |
+
batch_type: numel
|
78 |
+
valid_batch_type: null
|
79 |
+
fold_length: []
|
80 |
+
sort_in_batch: descending
|
81 |
+
sort_batch: descending
|
82 |
+
multiple_iterator: false
|
83 |
+
chunk_length: 500
|
84 |
+
chunk_shift_ratio: 0.5
|
85 |
+
num_cache_chunks: 1024
|
86 |
+
chunk_excluded_key_prefixes: []
|
87 |
+
train_data_path_and_name_and_type:
|
88 |
+
- - dump/raw/train/text
|
89 |
+
- text
|
90 |
+
- text
|
91 |
+
- - dump/raw/train/wav.scp
|
92 |
+
- speech
|
93 |
+
- sound
|
94 |
+
valid_data_path_and_name_and_type:
|
95 |
+
- - dump/raw/dev/text
|
96 |
+
- text
|
97 |
+
- text
|
98 |
+
- - dump/raw/dev/wav.scp
|
99 |
+
- speech
|
100 |
+
- sound
|
101 |
+
allow_variable_data_keys: false
|
102 |
+
max_cache_size: 0.0
|
103 |
+
max_cache_fd: 32
|
104 |
+
valid_max_cache_size: null
|
105 |
+
exclude_weight_decay: false
|
106 |
+
exclude_weight_decay_conf: {}
|
107 |
+
optim: adam
|
108 |
+
optim_conf:
|
109 |
+
lr: 0.001
|
110 |
+
eps: 1.0e-06
|
111 |
+
weight_decay: 0.0
|
112 |
+
scheduler: null
|
113 |
+
scheduler_conf: {}
|
114 |
+
token_list:
|
115 |
+
- <blank>
|
116 |
+
- <unk>
|
117 |
+
- a
|
118 |
+
- sil
|
119 |
+
- l
|
120 |
+
- aa
|
121 |
+
- m
|
122 |
+
- ii0
|
123 |
+
- t
|
124 |
+
- <
|
125 |
+
- n
|
126 |
+
- r
|
127 |
+
- E
|
128 |
+
- i0
|
129 |
+
- b
|
130 |
+
- uu0
|
131 |
+
- f
|
132 |
+
- i1
|
133 |
+
- k
|
134 |
+
- w
|
135 |
+
- A
|
136 |
+
- s
|
137 |
+
- y
|
138 |
+
- d
|
139 |
+
- q
|
140 |
+
- h
|
141 |
+
- H
|
142 |
+
- $
|
143 |
+
- u0
|
144 |
+
- AA
|
145 |
+
- j
|
146 |
+
- T
|
147 |
+
- x
|
148 |
+
- S
|
149 |
+
- z
|
150 |
+
- ll
|
151 |
+
- I1
|
152 |
+
- D
|
153 |
+
- II0
|
154 |
+
- g
|
155 |
+
- tt
|
156 |
+
- rr
|
157 |
+
- I0
|
158 |
+
- UU0
|
159 |
+
- dd
|
160 |
+
- u1
|
161 |
+
- U0
|
162 |
+
- mm
|
163 |
+
- nn
|
164 |
+
- '*'
|
165 |
+
- $$
|
166 |
+
- bb
|
167 |
+
- yy
|
168 |
+
- ss
|
169 |
+
- jj
|
170 |
+
- ww
|
171 |
+
- ^
|
172 |
+
- SS
|
173 |
+
- TT
|
174 |
+
- Z
|
175 |
+
- zz
|
176 |
+
- kk
|
177 |
+
- U1
|
178 |
+
- HH
|
179 |
+
- ff
|
180 |
+
- qq
|
181 |
+
- xx
|
182 |
+
- ^^
|
183 |
+
- DD
|
184 |
+
- hh
|
185 |
+
- EE
|
186 |
+
- ZZ
|
187 |
+
- '**'
|
188 |
+
- aaaa
|
189 |
+
- ssss
|
190 |
+
- v
|
191 |
+
- uu1
|
192 |
+
- jjjj
|
193 |
+
- <sos/eos>
|
194 |
+
odim: null
|
195 |
+
model_conf: {}
|
196 |
+
use_preprocessor: true
|
197 |
+
token_type: phn
|
198 |
+
bpemodel: null
|
199 |
+
non_linguistic_symbols: null
|
200 |
+
cleaner: null
|
201 |
+
g2p: null
|
202 |
+
feats_extract: fbank
|
203 |
+
feats_extract_conf:
|
204 |
+
n_fft: 1024
|
205 |
+
hop_length: 256
|
206 |
+
win_length: null
|
207 |
+
fs: 22050
|
208 |
+
fmin: 80
|
209 |
+
fmax: 7600
|
210 |
+
n_mels: 80
|
211 |
+
normalize: null
|
212 |
+
normalize_conf: {}
|
213 |
+
tts: tacotron2
|
214 |
+
tts_conf:
|
215 |
+
embed_dim: 512
|
216 |
+
elayers: 1
|
217 |
+
eunits: 512
|
218 |
+
econv_layers: 3
|
219 |
+
econv_chans: 512
|
220 |
+
econv_filts: 5
|
221 |
+
atype: location
|
222 |
+
adim: 512
|
223 |
+
aconv_chans: 32
|
224 |
+
aconv_filts: 15
|
225 |
+
cumulate_att_w: true
|
226 |
+
dlayers: 2
|
227 |
+
dunits: 1024
|
228 |
+
prenet_layers: 2
|
229 |
+
prenet_units: 256
|
230 |
+
postnet_layers: 5
|
231 |
+
postnet_chans: 512
|
232 |
+
postnet_filts: 5
|
233 |
+
output_activation: null
|
234 |
+
use_batch_norm: true
|
235 |
+
use_concate: true
|
236 |
+
use_residual: false
|
237 |
+
dropout_rate: 0.5
|
238 |
+
zoneout_rate: 0.1
|
239 |
+
reduction_factor: 3
|
240 |
+
spk_embed_dim: null
|
241 |
+
use_masking: true
|
242 |
+
bce_pos_weight: 20.0
|
243 |
+
use_guided_attn_loss: true
|
244 |
+
guided_attn_loss_sigma: 0.4
|
245 |
+
guided_attn_loss_lambda: 1.0
|
246 |
+
pitch_extract: null
|
247 |
+
pitch_extract_conf:
|
248 |
+
fs: 22050
|
249 |
+
n_fft: 1024
|
250 |
+
hop_length: 256
|
251 |
+
f0max: 400
|
252 |
+
f0min: 80
|
253 |
+
pitch_normalize: null
|
254 |
+
pitch_normalize_conf: {}
|
255 |
+
energy_extract: null
|
256 |
+
energy_extract_conf:
|
257 |
+
fs: 22050
|
258 |
+
n_fft: 1024
|
259 |
+
hop_length: 256
|
260 |
+
win_length: null
|
261 |
+
energy_normalize: null
|
262 |
+
energy_normalize_conf: {}
|
263 |
+
required:
|
264 |
+
- output_dir
|
265 |
+
- token_list
|
266 |
+
version: '202304'
|
267 |
+
distributed: false
|
exp/tts_stats_raw_phn_none/logdir/stats.12/train/batch_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
text
|
2 |
+
speech
|
exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_lengths_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d49bfea4033ce1e51b1a17d023326b9c8fc5b58658ad92a4ab13fae6f7b8d624
|
3 |
+
size 778
|
exp/tts_stats_raw_phn_none/logdir/stats.12/train/feats_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:75c6bcb5409152fe06dfbb367a0d796774c6f1e94af5fa448137f8a901f9c284
|
3 |
+
size 1402
|
exp/tts_stats_raw_phn_none/logdir/stats.12/train/speech_shape
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
19360 171776
|
2 |
+
19366 143104
|
3 |
+
19367 199936
|
4 |
+
19371 145920
|
5 |
+
19372 162816
|
6 |
+
19374 145664
|
7 |
+
19376 201682
|
8 |
+
19387 219904
|
9 |
+
19396 130048
|
10 |
+
19399 112896
|
11 |
+
194 140032
|
12 |
+
19400 183808
|
13 |
+
19404 159488
|
14 |
+
19406 186624
|
15 |
+
19410 183552
|
16 |
+
19413 121088
|
17 |
+
19414 134912
|
18 |
+
19423 198400
|
19 |
+
19429 195328
|
20 |
+
19439 114944
|
21 |
+
19440 97280
|
22 |
+
19449 159488
|
23 |
+
19451 140032
|
24 |
+
19454 120320
|
25 |
+
19477 191488
|
26 |
+
19482 157696
|
27 |
+
19488 169472
|
28 |
+
19496 129792
|
29 |
+
19499 153344
|
30 |
+
195 122624
|
31 |
+
19501 137216
|
32 |
+
19506 162816
|
33 |
+
19509 143872
|
34 |
+
19510 119040
|
35 |
+
19511 146688
|
36 |
+
19521 132864
|
37 |
+
19522 167680
|
38 |
+
19524 146944
|
39 |
+
19529 188928
|
40 |
+
19540 193536
|
41 |
+
19542 179456
|
42 |
+
19543 159669
|
43 |
+
19548 138752
|
exp/tts_stats_raw_phn_none/logdir/stats.12/train/stats_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
feats
|
2 |
+
feats_lengths
|
exp/tts_stats_raw_phn_none/logdir/stats.12/train/text_shape
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
19360 89
|
2 |
+
19366 85
|
3 |
+
19367 92
|
4 |
+
19371 64
|
5 |
+
19372 84
|
6 |
+
19374 75
|
7 |
+
19376 103
|
8 |
+
19387 108
|
9 |
+
19396 57
|
10 |
+
19399 54
|
11 |
+
194 62
|
12 |
+
19400 92
|
13 |
+
19404 99
|
14 |
+
19406 103
|
15 |
+
19410 102
|
16 |
+
19413 67
|
17 |
+
19414 60
|
18 |
+
19423 90
|
19 |
+
19429 90
|
20 |
+
19439 51
|
21 |
+
19440 33
|
22 |
+
19449 92
|
23 |
+
19451 67
|
24 |
+
19454 61
|
25 |
+
19477 97
|
26 |
+
19482 84
|
27 |
+
19488 93
|
28 |
+
19496 63
|
29 |
+
19499 65
|
30 |
+
195 68
|
31 |
+
19501 60
|
32 |
+
19506 96
|
33 |
+
19509 80
|
34 |
+
19510 57
|
35 |
+
19511 77
|
36 |
+
19521 61
|
37 |
+
19522 80
|
38 |
+
19524 67
|
39 |
+
19529 91
|
40 |
+
19540 101
|
41 |
+
19542 107
|
42 |
+
19543 78
|
43 |
+
19548 61
|
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/batch_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
text
|
2 |
+
speech
|
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_lengths_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f504865400934158e457d1520e847de7701d6bd8479c772a7d9710d35616c234
|
3 |
+
size 778
|
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/feats_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:51eb4c28274a7e81976f604f330f3c2f10cc6cf6a8befcf261e3e49cbdd44ab0
|
3 |
+
size 1402
|
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/speech_shape
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
18963 129280
|
2 |
+
19178 177408
|
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/stats_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
feats
|
2 |
+
feats_lengths
|
exp/tts_stats_raw_phn_none/logdir/stats.12/valid/text_shape
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
18963 58
|
2 |
+
19178 91
|
exp/tts_stats_raw_phn_none/logdir/stats.13.log
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.13.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.13.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.13 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
2 |
+
# Started at Thu Jul 13 14:10:31 UTC 2023
|
3 |
+
#
|
4 |
+
/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
|
5 |
+
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
|
6 |
+
/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.13.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.13.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.13 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
7 |
+
[7850374a3496] 2023-07-13 14:10:40,144 (tts:293) INFO: Vocabulary size: 79
|
8 |
+
[7850374a3496] 2023-07-13 14:10:40,861 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
|
9 |
+
[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1204) INFO: Model structure:
|
10 |
+
ESPnetTTSModel(
|
11 |
+
(feats_extract): LogMelFbank(
|
12 |
+
(stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
|
13 |
+
(logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
|
14 |
+
)
|
15 |
+
(tts): Tacotron2(
|
16 |
+
(enc): Encoder(
|
17 |
+
(embed): Embedding(79, 512, padding_idx=0)
|
18 |
+
(convs): ModuleList(
|
19 |
+
(0-2): 3 x Sequential(
|
20 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
21 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
22 |
+
(2): ReLU()
|
23 |
+
(3): Dropout(p=0.5, inplace=False)
|
24 |
+
)
|
25 |
+
)
|
26 |
+
(blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
|
27 |
+
)
|
28 |
+
(dec): Decoder(
|
29 |
+
(att): AttLoc(
|
30 |
+
(mlp_enc): Linear(in_features=512, out_features=512, bias=True)
|
31 |
+
(mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
|
32 |
+
(mlp_att): Linear(in_features=32, out_features=512, bias=False)
|
33 |
+
(loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
|
34 |
+
(gvec): Linear(in_features=512, out_features=1, bias=True)
|
35 |
+
)
|
36 |
+
(lstm): ModuleList(
|
37 |
+
(0): ZoneOutCell(
|
38 |
+
(cell): LSTMCell(768, 1024)
|
39 |
+
)
|
40 |
+
(1): ZoneOutCell(
|
41 |
+
(cell): LSTMCell(1024, 1024)
|
42 |
+
)
|
43 |
+
)
|
44 |
+
(prenet): Prenet(
|
45 |
+
(prenet): ModuleList(
|
46 |
+
(0): Sequential(
|
47 |
+
(0): Linear(in_features=80, out_features=256, bias=True)
|
48 |
+
(1): ReLU()
|
49 |
+
)
|
50 |
+
(1): Sequential(
|
51 |
+
(0): Linear(in_features=256, out_features=256, bias=True)
|
52 |
+
(1): ReLU()
|
53 |
+
)
|
54 |
+
)
|
55 |
+
)
|
56 |
+
(postnet): Postnet(
|
57 |
+
(postnet): ModuleList(
|
58 |
+
(0): Sequential(
|
59 |
+
(0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
60 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
61 |
+
(2): Tanh()
|
62 |
+
(3): Dropout(p=0.5, inplace=False)
|
63 |
+
)
|
64 |
+
(1-3): 3 x Sequential(
|
65 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
66 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
67 |
+
(2): Tanh()
|
68 |
+
(3): Dropout(p=0.5, inplace=False)
|
69 |
+
)
|
70 |
+
(4): Sequential(
|
71 |
+
(0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
72 |
+
(1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
73 |
+
(2): Dropout(p=0.5, inplace=False)
|
74 |
+
)
|
75 |
+
)
|
76 |
+
)
|
77 |
+
(feat_out): Linear(in_features=1536, out_features=240, bias=False)
|
78 |
+
(prob_out): Linear(in_features=1536, out_features=3, bias=True)
|
79 |
+
)
|
80 |
+
(taco2_loss): Tacotron2Loss(
|
81 |
+
(l1_criterion): L1Loss()
|
82 |
+
(mse_criterion): MSELoss()
|
83 |
+
(bce_criterion): BCEWithLogitsLoss()
|
84 |
+
)
|
85 |
+
(attn_loss): GuidedAttentionLoss()
|
86 |
+
)
|
87 |
+
)
|
88 |
+
|
89 |
+
Model summary:
|
90 |
+
Class Name: ESPnetTTSModel
|
91 |
+
Total Number of model parameters: 26.91 M
|
92 |
+
Number of trainable parameters: 26.91 M (100.0%)
|
93 |
+
Size: 107.63 MB
|
94 |
+
Type: torch.float32
|
95 |
+
[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1207) INFO: Optimizer:
|
96 |
+
Adam (
|
97 |
+
Parameter Group 0
|
98 |
+
amsgrad: False
|
99 |
+
betas: (0.9, 0.999)
|
100 |
+
capturable: False
|
101 |
+
differentiable: False
|
102 |
+
eps: 1e-06
|
103 |
+
foreach: None
|
104 |
+
fused: None
|
105 |
+
lr: 0.001
|
106 |
+
maximize: False
|
107 |
+
weight_decay: 0.0
|
108 |
+
)
|
109 |
+
[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1208) INFO: Scheduler: None
|
110 |
+
[7850374a3496] 2023-07-13 14:10:40,864 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.13/config.yaml
|
111 |
+
[7850374a3496] 2023-07-13 14:10:40,891 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.13', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.13.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.13.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
|
112 |
+
/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
|
113 |
+
Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
|
114 |
+
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
|
115 |
+
# Accounting: time=13 threads=1
|
116 |
+
# Ended (code 0) at Thu Jul 13 14:10:44 UTC 2023, elapsed time 13 seconds
|
exp/tts_stats_raw_phn_none/logdir/stats.14.log
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.14.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.14.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.14 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
2 |
+
# Started at Thu Jul 13 14:10:32 UTC 2023
|
3 |
+
#
|
4 |
+
/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
|
5 |
+
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
|
6 |
+
/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.14.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.14.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.14 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
7 |
+
[7850374a3496] 2023-07-13 14:10:40,319 (tts:293) INFO: Vocabulary size: 79
|
8 |
+
[7850374a3496] 2023-07-13 14:10:41,034 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
|
9 |
+
[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1204) INFO: Model structure:
|
10 |
+
ESPnetTTSModel(
|
11 |
+
(feats_extract): LogMelFbank(
|
12 |
+
(stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
|
13 |
+
(logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
|
14 |
+
)
|
15 |
+
(tts): Tacotron2(
|
16 |
+
(enc): Encoder(
|
17 |
+
(embed): Embedding(79, 512, padding_idx=0)
|
18 |
+
(convs): ModuleList(
|
19 |
+
(0-2): 3 x Sequential(
|
20 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
21 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
22 |
+
(2): ReLU()
|
23 |
+
(3): Dropout(p=0.5, inplace=False)
|
24 |
+
)
|
25 |
+
)
|
26 |
+
(blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
|
27 |
+
)
|
28 |
+
(dec): Decoder(
|
29 |
+
(att): AttLoc(
|
30 |
+
(mlp_enc): Linear(in_features=512, out_features=512, bias=True)
|
31 |
+
(mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
|
32 |
+
(mlp_att): Linear(in_features=32, out_features=512, bias=False)
|
33 |
+
(loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
|
34 |
+
(gvec): Linear(in_features=512, out_features=1, bias=True)
|
35 |
+
)
|
36 |
+
(lstm): ModuleList(
|
37 |
+
(0): ZoneOutCell(
|
38 |
+
(cell): LSTMCell(768, 1024)
|
39 |
+
)
|
40 |
+
(1): ZoneOutCell(
|
41 |
+
(cell): LSTMCell(1024, 1024)
|
42 |
+
)
|
43 |
+
)
|
44 |
+
(prenet): Prenet(
|
45 |
+
(prenet): ModuleList(
|
46 |
+
(0): Sequential(
|
47 |
+
(0): Linear(in_features=80, out_features=256, bias=True)
|
48 |
+
(1): ReLU()
|
49 |
+
)
|
50 |
+
(1): Sequential(
|
51 |
+
(0): Linear(in_features=256, out_features=256, bias=True)
|
52 |
+
(1): ReLU()
|
53 |
+
)
|
54 |
+
)
|
55 |
+
)
|
56 |
+
(postnet): Postnet(
|
57 |
+
(postnet): ModuleList(
|
58 |
+
(0): Sequential(
|
59 |
+
(0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
60 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
61 |
+
(2): Tanh()
|
62 |
+
(3): Dropout(p=0.5, inplace=False)
|
63 |
+
)
|
64 |
+
(1-3): 3 x Sequential(
|
65 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
66 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
67 |
+
(2): Tanh()
|
68 |
+
(3): Dropout(p=0.5, inplace=False)
|
69 |
+
)
|
70 |
+
(4): Sequential(
|
71 |
+
(0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
72 |
+
(1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
73 |
+
(2): Dropout(p=0.5, inplace=False)
|
74 |
+
)
|
75 |
+
)
|
76 |
+
)
|
77 |
+
(feat_out): Linear(in_features=1536, out_features=240, bias=False)
|
78 |
+
(prob_out): Linear(in_features=1536, out_features=3, bias=True)
|
79 |
+
)
|
80 |
+
(taco2_loss): Tacotron2Loss(
|
81 |
+
(l1_criterion): L1Loss()
|
82 |
+
(mse_criterion): MSELoss()
|
83 |
+
(bce_criterion): BCEWithLogitsLoss()
|
84 |
+
)
|
85 |
+
(attn_loss): GuidedAttentionLoss()
|
86 |
+
)
|
87 |
+
)
|
88 |
+
|
89 |
+
Model summary:
|
90 |
+
Class Name: ESPnetTTSModel
|
91 |
+
Total Number of model parameters: 26.91 M
|
92 |
+
Number of trainable parameters: 26.91 M (100.0%)
|
93 |
+
Size: 107.63 MB
|
94 |
+
Type: torch.float32
|
95 |
+
[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1207) INFO: Optimizer:
|
96 |
+
Adam (
|
97 |
+
Parameter Group 0
|
98 |
+
amsgrad: False
|
99 |
+
betas: (0.9, 0.999)
|
100 |
+
capturable: False
|
101 |
+
differentiable: False
|
102 |
+
eps: 1e-06
|
103 |
+
foreach: None
|
104 |
+
fused: None
|
105 |
+
lr: 0.001
|
106 |
+
maximize: False
|
107 |
+
weight_decay: 0.0
|
108 |
+
)
|
109 |
+
[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1208) INFO: Scheduler: None
|
110 |
+
[7850374a3496] 2023-07-13 14:10:41,037 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.14/config.yaml
|
111 |
+
[7850374a3496] 2023-07-13 14:10:41,061 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.14', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.14.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.14.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
|
112 |
+
/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
|
113 |
+
Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
|
114 |
+
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
|
115 |
+
# Accounting: time=13 threads=1
|
116 |
+
# Ended (code 0) at Thu Jul 13 14:10:45 UTC 2023, elapsed time 13 seconds
|
exp/tts_stats_raw_phn_none/logdir/stats.15.log
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.15.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.15.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.15 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
2 |
+
# Started at Thu Jul 13 14:10:44 UTC 2023
|
3 |
+
#
|
4 |
+
/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
|
5 |
+
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
|
6 |
+
/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.15.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.15.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.15 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
7 |
+
[7850374a3496] 2023-07-13 14:10:52,286 (tts:293) INFO: Vocabulary size: 79
|
8 |
+
[7850374a3496] 2023-07-13 14:10:52,979 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
|
9 |
+
[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1204) INFO: Model structure:
|
10 |
+
ESPnetTTSModel(
|
11 |
+
(feats_extract): LogMelFbank(
|
12 |
+
(stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
|
13 |
+
(logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
|
14 |
+
)
|
15 |
+
(tts): Tacotron2(
|
16 |
+
(enc): Encoder(
|
17 |
+
(embed): Embedding(79, 512, padding_idx=0)
|
18 |
+
(convs): ModuleList(
|
19 |
+
(0-2): 3 x Sequential(
|
20 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
21 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
22 |
+
(2): ReLU()
|
23 |
+
(3): Dropout(p=0.5, inplace=False)
|
24 |
+
)
|
25 |
+
)
|
26 |
+
(blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
|
27 |
+
)
|
28 |
+
(dec): Decoder(
|
29 |
+
(att): AttLoc(
|
30 |
+
(mlp_enc): Linear(in_features=512, out_features=512, bias=True)
|
31 |
+
(mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
|
32 |
+
(mlp_att): Linear(in_features=32, out_features=512, bias=False)
|
33 |
+
(loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
|
34 |
+
(gvec): Linear(in_features=512, out_features=1, bias=True)
|
35 |
+
)
|
36 |
+
(lstm): ModuleList(
|
37 |
+
(0): ZoneOutCell(
|
38 |
+
(cell): LSTMCell(768, 1024)
|
39 |
+
)
|
40 |
+
(1): ZoneOutCell(
|
41 |
+
(cell): LSTMCell(1024, 1024)
|
42 |
+
)
|
43 |
+
)
|
44 |
+
(prenet): Prenet(
|
45 |
+
(prenet): ModuleList(
|
46 |
+
(0): Sequential(
|
47 |
+
(0): Linear(in_features=80, out_features=256, bias=True)
|
48 |
+
(1): ReLU()
|
49 |
+
)
|
50 |
+
(1): Sequential(
|
51 |
+
(0): Linear(in_features=256, out_features=256, bias=True)
|
52 |
+
(1): ReLU()
|
53 |
+
)
|
54 |
+
)
|
55 |
+
)
|
56 |
+
(postnet): Postnet(
|
57 |
+
(postnet): ModuleList(
|
58 |
+
(0): Sequential(
|
59 |
+
(0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
60 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
61 |
+
(2): Tanh()
|
62 |
+
(3): Dropout(p=0.5, inplace=False)
|
63 |
+
)
|
64 |
+
(1-3): 3 x Sequential(
|
65 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
66 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
67 |
+
(2): Tanh()
|
68 |
+
(3): Dropout(p=0.5, inplace=False)
|
69 |
+
)
|
70 |
+
(4): Sequential(
|
71 |
+
(0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
72 |
+
(1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
73 |
+
(2): Dropout(p=0.5, inplace=False)
|
74 |
+
)
|
75 |
+
)
|
76 |
+
)
|
77 |
+
(feat_out): Linear(in_features=1536, out_features=240, bias=False)
|
78 |
+
(prob_out): Linear(in_features=1536, out_features=3, bias=True)
|
79 |
+
)
|
80 |
+
(taco2_loss): Tacotron2Loss(
|
81 |
+
(l1_criterion): L1Loss()
|
82 |
+
(mse_criterion): MSELoss()
|
83 |
+
(bce_criterion): BCEWithLogitsLoss()
|
84 |
+
)
|
85 |
+
(attn_loss): GuidedAttentionLoss()
|
86 |
+
)
|
87 |
+
)
|
88 |
+
|
89 |
+
Model summary:
|
90 |
+
Class Name: ESPnetTTSModel
|
91 |
+
Total Number of model parameters: 26.91 M
|
92 |
+
Number of trainable parameters: 26.91 M (100.0%)
|
93 |
+
Size: 107.63 MB
|
94 |
+
Type: torch.float32
|
95 |
+
[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1207) INFO: Optimizer:
|
96 |
+
Adam (
|
97 |
+
Parameter Group 0
|
98 |
+
amsgrad: False
|
99 |
+
betas: (0.9, 0.999)
|
100 |
+
capturable: False
|
101 |
+
differentiable: False
|
102 |
+
eps: 1e-06
|
103 |
+
foreach: None
|
104 |
+
fused: None
|
105 |
+
lr: 0.001
|
106 |
+
maximize: False
|
107 |
+
weight_decay: 0.0
|
108 |
+
)
|
109 |
+
[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1208) INFO: Scheduler: None
|
110 |
+
[7850374a3496] 2023-07-13 14:10:52,982 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml
|
111 |
+
[7850374a3496] 2023-07-13 14:10:53,009 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.15', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.15.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.15.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
|
112 |
+
/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
|
113 |
+
Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
|
114 |
+
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
|
115 |
+
# Accounting: time=12 threads=1
|
116 |
+
# Ended (code 0) at Thu Jul 13 14:10:56 UTC 2023, elapsed time 12 seconds
|
exp/tts_stats_raw_phn_none/logdir/stats.15/config.yaml
ADDED
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config: conf/tuning/finetune_tacotron2.yaml
|
2 |
+
print_config: false
|
3 |
+
log_level: INFO
|
4 |
+
dry_run: false
|
5 |
+
iterator_type: sequence
|
6 |
+
output_dir: exp/tts_stats_raw_phn_none/logdir/stats.15
|
7 |
+
ngpu: 0
|
8 |
+
seed: 0
|
9 |
+
num_workers: 1
|
10 |
+
num_att_plot: 3
|
11 |
+
dist_backend: nccl
|
12 |
+
dist_init_method: env://
|
13 |
+
dist_world_size: null
|
14 |
+
dist_rank: null
|
15 |
+
local_rank: null
|
16 |
+
dist_master_addr: null
|
17 |
+
dist_master_port: null
|
18 |
+
dist_launcher: null
|
19 |
+
multiprocessing_distributed: false
|
20 |
+
unused_parameters: false
|
21 |
+
sharded_ddp: false
|
22 |
+
cudnn_enabled: true
|
23 |
+
cudnn_benchmark: false
|
24 |
+
cudnn_deterministic: true
|
25 |
+
collect_stats: true
|
26 |
+
write_collected_feats: false
|
27 |
+
max_epoch: 120
|
28 |
+
patience: null
|
29 |
+
val_scheduler_criterion:
|
30 |
+
- valid
|
31 |
+
- loss
|
32 |
+
early_stopping_criterion:
|
33 |
+
- valid
|
34 |
+
- loss
|
35 |
+
- min
|
36 |
+
best_model_criterion:
|
37 |
+
- - valid
|
38 |
+
- loss
|
39 |
+
- min
|
40 |
+
- - train
|
41 |
+
- loss
|
42 |
+
- min
|
43 |
+
keep_nbest_models: 5
|
44 |
+
nbest_averaging_interval: 0
|
45 |
+
grad_clip: 1.0
|
46 |
+
grad_clip_type: 2.0
|
47 |
+
grad_noise: false
|
48 |
+
accum_grad: 1
|
49 |
+
no_forward_run: false
|
50 |
+
resume: false
|
51 |
+
train_dtype: float32
|
52 |
+
use_amp: false
|
53 |
+
log_interval: null
|
54 |
+
use_matplotlib: true
|
55 |
+
use_tensorboard: true
|
56 |
+
create_graph_in_tensorboard: false
|
57 |
+
use_wandb: false
|
58 |
+
wandb_project: null
|
59 |
+
wandb_id: null
|
60 |
+
wandb_entity: null
|
61 |
+
wandb_name: null
|
62 |
+
wandb_model_log_interval: -1
|
63 |
+
detect_anomaly: false
|
64 |
+
pretrain_path: null
|
65 |
+
init_param: []
|
66 |
+
ignore_init_mismatch: false
|
67 |
+
freeze_param: []
|
68 |
+
num_iters_per_epoch: 200
|
69 |
+
batch_size: 20
|
70 |
+
valid_batch_size: null
|
71 |
+
batch_bins: 1600000
|
72 |
+
valid_batch_bins: null
|
73 |
+
train_shape_file:
|
74 |
+
- exp/tts_stats_raw_phn_none/logdir/train.15.scp
|
75 |
+
valid_shape_file:
|
76 |
+
- exp/tts_stats_raw_phn_none/logdir/valid.15.scp
|
77 |
+
batch_type: numel
|
78 |
+
valid_batch_type: null
|
79 |
+
fold_length: []
|
80 |
+
sort_in_batch: descending
|
81 |
+
sort_batch: descending
|
82 |
+
multiple_iterator: false
|
83 |
+
chunk_length: 500
|
84 |
+
chunk_shift_ratio: 0.5
|
85 |
+
num_cache_chunks: 1024
|
86 |
+
chunk_excluded_key_prefixes: []
|
87 |
+
train_data_path_and_name_and_type:
|
88 |
+
- - dump/raw/train/text
|
89 |
+
- text
|
90 |
+
- text
|
91 |
+
- - dump/raw/train/wav.scp
|
92 |
+
- speech
|
93 |
+
- sound
|
94 |
+
valid_data_path_and_name_and_type:
|
95 |
+
- - dump/raw/dev/text
|
96 |
+
- text
|
97 |
+
- text
|
98 |
+
- - dump/raw/dev/wav.scp
|
99 |
+
- speech
|
100 |
+
- sound
|
101 |
+
allow_variable_data_keys: false
|
102 |
+
max_cache_size: 0.0
|
103 |
+
max_cache_fd: 32
|
104 |
+
valid_max_cache_size: null
|
105 |
+
exclude_weight_decay: false
|
106 |
+
exclude_weight_decay_conf: {}
|
107 |
+
optim: adam
|
108 |
+
optim_conf:
|
109 |
+
lr: 0.001
|
110 |
+
eps: 1.0e-06
|
111 |
+
weight_decay: 0.0
|
112 |
+
scheduler: null
|
113 |
+
scheduler_conf: {}
|
114 |
+
token_list:
|
115 |
+
- <blank>
|
116 |
+
- <unk>
|
117 |
+
- a
|
118 |
+
- sil
|
119 |
+
- l
|
120 |
+
- aa
|
121 |
+
- m
|
122 |
+
- ii0
|
123 |
+
- t
|
124 |
+
- <
|
125 |
+
- n
|
126 |
+
- r
|
127 |
+
- E
|
128 |
+
- i0
|
129 |
+
- b
|
130 |
+
- uu0
|
131 |
+
- f
|
132 |
+
- i1
|
133 |
+
- k
|
134 |
+
- w
|
135 |
+
- A
|
136 |
+
- s
|
137 |
+
- y
|
138 |
+
- d
|
139 |
+
- q
|
140 |
+
- h
|
141 |
+
- H
|
142 |
+
- $
|
143 |
+
- u0
|
144 |
+
- AA
|
145 |
+
- j
|
146 |
+
- T
|
147 |
+
- x
|
148 |
+
- S
|
149 |
+
- z
|
150 |
+
- ll
|
151 |
+
- I1
|
152 |
+
- D
|
153 |
+
- II0
|
154 |
+
- g
|
155 |
+
- tt
|
156 |
+
- rr
|
157 |
+
- I0
|
158 |
+
- UU0
|
159 |
+
- dd
|
160 |
+
- u1
|
161 |
+
- U0
|
162 |
+
- mm
|
163 |
+
- nn
|
164 |
+
- '*'
|
165 |
+
- $$
|
166 |
+
- bb
|
167 |
+
- yy
|
168 |
+
- ss
|
169 |
+
- jj
|
170 |
+
- ww
|
171 |
+
- ^
|
172 |
+
- SS
|
173 |
+
- TT
|
174 |
+
- Z
|
175 |
+
- zz
|
176 |
+
- kk
|
177 |
+
- U1
|
178 |
+
- HH
|
179 |
+
- ff
|
180 |
+
- qq
|
181 |
+
- xx
|
182 |
+
- ^^
|
183 |
+
- DD
|
184 |
+
- hh
|
185 |
+
- EE
|
186 |
+
- ZZ
|
187 |
+
- '**'
|
188 |
+
- aaaa
|
189 |
+
- ssss
|
190 |
+
- v
|
191 |
+
- uu1
|
192 |
+
- jjjj
|
193 |
+
- <sos/eos>
|
194 |
+
odim: null
|
195 |
+
model_conf: {}
|
196 |
+
use_preprocessor: true
|
197 |
+
token_type: phn
|
198 |
+
bpemodel: null
|
199 |
+
non_linguistic_symbols: null
|
200 |
+
cleaner: null
|
201 |
+
g2p: null
|
202 |
+
feats_extract: fbank
|
203 |
+
feats_extract_conf:
|
204 |
+
n_fft: 1024
|
205 |
+
hop_length: 256
|
206 |
+
win_length: null
|
207 |
+
fs: 22050
|
208 |
+
fmin: 80
|
209 |
+
fmax: 7600
|
210 |
+
n_mels: 80
|
211 |
+
normalize: null
|
212 |
+
normalize_conf: {}
|
213 |
+
tts: tacotron2
|
214 |
+
tts_conf:
|
215 |
+
embed_dim: 512
|
216 |
+
elayers: 1
|
217 |
+
eunits: 512
|
218 |
+
econv_layers: 3
|
219 |
+
econv_chans: 512
|
220 |
+
econv_filts: 5
|
221 |
+
atype: location
|
222 |
+
adim: 512
|
223 |
+
aconv_chans: 32
|
224 |
+
aconv_filts: 15
|
225 |
+
cumulate_att_w: true
|
226 |
+
dlayers: 2
|
227 |
+
dunits: 1024
|
228 |
+
prenet_layers: 2
|
229 |
+
prenet_units: 256
|
230 |
+
postnet_layers: 5
|
231 |
+
postnet_chans: 512
|
232 |
+
postnet_filts: 5
|
233 |
+
output_activation: null
|
234 |
+
use_batch_norm: true
|
235 |
+
use_concate: true
|
236 |
+
use_residual: false
|
237 |
+
dropout_rate: 0.5
|
238 |
+
zoneout_rate: 0.1
|
239 |
+
reduction_factor: 3
|
240 |
+
spk_embed_dim: null
|
241 |
+
use_masking: true
|
242 |
+
bce_pos_weight: 20.0
|
243 |
+
use_guided_attn_loss: true
|
244 |
+
guided_attn_loss_sigma: 0.4
|
245 |
+
guided_attn_loss_lambda: 1.0
|
246 |
+
pitch_extract: null
|
247 |
+
pitch_extract_conf:
|
248 |
+
fs: 22050
|
249 |
+
n_fft: 1024
|
250 |
+
hop_length: 256
|
251 |
+
f0max: 400
|
252 |
+
f0min: 80
|
253 |
+
pitch_normalize: null
|
254 |
+
pitch_normalize_conf: {}
|
255 |
+
energy_extract: null
|
256 |
+
energy_extract_conf:
|
257 |
+
fs: 22050
|
258 |
+
n_fft: 1024
|
259 |
+
hop_length: 256
|
260 |
+
win_length: null
|
261 |
+
energy_normalize: null
|
262 |
+
energy_normalize_conf: {}
|
263 |
+
required:
|
264 |
+
- output_dir
|
265 |
+
- token_list
|
266 |
+
version: '202304'
|
267 |
+
distributed: false
|
exp/tts_stats_raw_phn_none/logdir/stats.15/train/batch_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
text
|
2 |
+
speech
|
exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_lengths_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b7321393931081a400396aafc1edb9605c0808638ac13716f0f23942f51e167a
|
3 |
+
size 778
|
exp/tts_stats_raw_phn_none/logdir/stats.15/train/feats_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:239fc3342b13b16bd839d1ac8b21666aad2e96b1b2341e9c6c4c53e063f99526
|
3 |
+
size 1402
|
exp/tts_stats_raw_phn_none/logdir/stats.15/train/speech_shape
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
19928 141056
|
2 |
+
19931 133376
|
3 |
+
19935 203520
|
4 |
+
19938 102144
|
5 |
+
19944 126464
|
6 |
+
19946 116992
|
7 |
+
19947 154112
|
8 |
+
19948 171637
|
9 |
+
19949 141056
|
10 |
+
19951 214272
|
11 |
+
19952 165376
|
12 |
+
19955 134912
|
13 |
+
19957 150596
|
14 |
+
19959 176896
|
15 |
+
19976 169472
|
16 |
+
19979 119808
|
17 |
+
19981 134144
|
18 |
+
19984 171520
|
19 |
+
19990 235008
|
20 |
+
19998 195840
|
21 |
+
200 125440
|
22 |
+
20001 184576
|
23 |
+
20005 108032
|
24 |
+
20020 164608
|
25 |
+
20022 235264
|
26 |
+
20029 174080
|
27 |
+
20038 216576
|
28 |
+
20042 241920
|
29 |
+
20051 203776
|
30 |
+
20055 168448
|
31 |
+
20062 152064
|
32 |
+
20080 219136
|
33 |
+
20087 116992
|
34 |
+
20095 193792
|
35 |
+
201 119040
|
36 |
+
20109 167424
|
37 |
+
20119 149760
|
38 |
+
20120 154368
|
39 |
+
20121 172288
|
40 |
+
20128 143872
|
41 |
+
20144 112128
|
42 |
+
20147 167168
|
43 |
+
20183 139520
|
exp/tts_stats_raw_phn_none/logdir/stats.15/train/stats_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
feats
|
2 |
+
feats_lengths
|
exp/tts_stats_raw_phn_none/logdir/stats.15/train/text_shape
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
19928 71
|
2 |
+
19931 84
|
3 |
+
19935 117
|
4 |
+
19938 58
|
5 |
+
19944 71
|
6 |
+
19946 52
|
7 |
+
19947 71
|
8 |
+
19948 80
|
9 |
+
19949 64
|
10 |
+
19951 133
|
11 |
+
19952 102
|
12 |
+
19955 64
|
13 |
+
19957 81
|
14 |
+
19959 106
|
15 |
+
19976 97
|
16 |
+
19979 66
|
17 |
+
19981 66
|
18 |
+
19984 80
|
19 |
+
19990 127
|
20 |
+
19998 100
|
21 |
+
200 64
|
22 |
+
20001 98
|
23 |
+
20005 61
|
24 |
+
20020 68
|
25 |
+
20022 143
|
26 |
+
20029 103
|
27 |
+
20038 123
|
28 |
+
20042 136
|
29 |
+
20051 106
|
30 |
+
20055 97
|
31 |
+
20062 90
|
32 |
+
20080 124
|
33 |
+
20087 52
|
34 |
+
20095 101
|
35 |
+
201 67
|
36 |
+
20109 82
|
37 |
+
20119 64
|
38 |
+
20120 93
|
39 |
+
20121 83
|
40 |
+
20128 67
|
41 |
+
20144 69
|
42 |
+
20147 78
|
43 |
+
20183 70
|
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/batch_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
text
|
2 |
+
speech
|
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_lengths_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7ab01415ef2a97eaa04e81355080bc38b3f9b0343f8e97e91044090b6ff63685
|
3 |
+
size 778
|
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/feats_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:89cf49faa040cf392b87903167b64d4599f1d322f1a2937c91823fca48e139a9
|
3 |
+
size 1402
|
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/speech_shape
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
19769 152064
|
2 |
+
19771 194816
|
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/stats_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
feats
|
2 |
+
feats_lengths
|
exp/tts_stats_raw_phn_none/logdir/stats.15/valid/text_shape
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
19769 84
|
2 |
+
19771 108
|
exp/tts_stats_raw_phn_none/logdir/stats.17.log
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python3 -m espnet2.bin.tts_train --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.17.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.17.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.17 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
2 |
+
# Started at Thu Jul 13 14:10:56 UTC 2023
|
3 |
+
#
|
4 |
+
/opt/conda/lib/python3.10/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5
|
5 |
+
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
|
6 |
+
/opt/conda/bin/python3 /kaggle/working/espnet/espnet2/bin/tts_train.py --collect_stats true --write_collected_feats false --use_preprocessor true --token_type phn --token_list dump/token_list/phn_none/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --normalize none --pitch_normalize none --energy_normalize none --train_data_path_and_name_and_type dump/raw/train/text,text,text --train_data_path_and_name_and_type dump/raw/train/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev/text,text,text --valid_data_path_and_name_and_type dump/raw/dev/wav.scp,speech,sound --train_shape_file exp/tts_stats_raw_phn_none/logdir/train.17.scp --valid_shape_file exp/tts_stats_raw_phn_none/logdir/valid.17.scp --output_dir exp/tts_stats_raw_phn_none/logdir/stats.17 --config conf/tuning/finetune_tacotron2.yaml --feats_extract fbank --feats_extract_conf n_fft=1024 --feats_extract_conf hop_length=256 --feats_extract_conf win_length=null --feats_extract_conf fs=22050 --feats_extract_conf fmin=80 --feats_extract_conf fmax=7600 --feats_extract_conf n_mels=80 --pitch_extract_conf fs=22050 --pitch_extract_conf n_fft=1024 --pitch_extract_conf hop_length=256 --pitch_extract_conf f0max=400 --pitch_extract_conf f0min=80 --energy_extract_conf fs=22050 --energy_extract_conf n_fft=1024 --energy_extract_conf hop_length=256 --energy_extract_conf win_length=null
|
7 |
+
[7850374a3496] 2023-07-13 14:11:04,338 (tts:293) INFO: Vocabulary size: 79
|
8 |
+
[7850374a3496] 2023-07-13 14:11:05,061 (abs_task:1203) INFO: pytorch.version=2.0.0, cuda.available=True, cudnn.version=8900, cudnn.benchmark=False, cudnn.deterministic=True
|
9 |
+
[7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1204) INFO: Model structure:
|
10 |
+
ESPnetTTSModel(
|
11 |
+
(feats_extract): LogMelFbank(
|
12 |
+
(stft): Stft(n_fft=1024, win_length=1024, hop_length=256, center=True, normalized=False, onesided=True)
|
13 |
+
(logmel): LogMel(sr=22050, n_fft=1024, n_mels=80, fmin=80, fmax=7600, htk=False)
|
14 |
+
)
|
15 |
+
(tts): Tacotron2(
|
16 |
+
(enc): Encoder(
|
17 |
+
(embed): Embedding(79, 512, padding_idx=0)
|
18 |
+
(convs): ModuleList(
|
19 |
+
(0-2): 3 x Sequential(
|
20 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
21 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
22 |
+
(2): ReLU()
|
23 |
+
(3): Dropout(p=0.5, inplace=False)
|
24 |
+
)
|
25 |
+
)
|
26 |
+
(blstm): LSTM(512, 256, batch_first=True, bidirectional=True)
|
27 |
+
)
|
28 |
+
(dec): Decoder(
|
29 |
+
(att): AttLoc(
|
30 |
+
(mlp_enc): Linear(in_features=512, out_features=512, bias=True)
|
31 |
+
(mlp_dec): Linear(in_features=1024, out_features=512, bias=False)
|
32 |
+
(mlp_att): Linear(in_features=32, out_features=512, bias=False)
|
33 |
+
(loc_conv): Conv2d(1, 32, kernel_size=(1, 31), stride=(1, 1), padding=(0, 15), bias=False)
|
34 |
+
(gvec): Linear(in_features=512, out_features=1, bias=True)
|
35 |
+
)
|
36 |
+
(lstm): ModuleList(
|
37 |
+
(0): ZoneOutCell(
|
38 |
+
(cell): LSTMCell(768, 1024)
|
39 |
+
)
|
40 |
+
(1): ZoneOutCell(
|
41 |
+
(cell): LSTMCell(1024, 1024)
|
42 |
+
)
|
43 |
+
)
|
44 |
+
(prenet): Prenet(
|
45 |
+
(prenet): ModuleList(
|
46 |
+
(0): Sequential(
|
47 |
+
(0): Linear(in_features=80, out_features=256, bias=True)
|
48 |
+
(1): ReLU()
|
49 |
+
)
|
50 |
+
(1): Sequential(
|
51 |
+
(0): Linear(in_features=256, out_features=256, bias=True)
|
52 |
+
(1): ReLU()
|
53 |
+
)
|
54 |
+
)
|
55 |
+
)
|
56 |
+
(postnet): Postnet(
|
57 |
+
(postnet): ModuleList(
|
58 |
+
(0): Sequential(
|
59 |
+
(0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
60 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
61 |
+
(2): Tanh()
|
62 |
+
(3): Dropout(p=0.5, inplace=False)
|
63 |
+
)
|
64 |
+
(1-3): 3 x Sequential(
|
65 |
+
(0): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
66 |
+
(1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
67 |
+
(2): Tanh()
|
68 |
+
(3): Dropout(p=0.5, inplace=False)
|
69 |
+
)
|
70 |
+
(4): Sequential(
|
71 |
+
(0): Conv1d(512, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
|
72 |
+
(1): BatchNorm1d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
73 |
+
(2): Dropout(p=0.5, inplace=False)
|
74 |
+
)
|
75 |
+
)
|
76 |
+
)
|
77 |
+
(feat_out): Linear(in_features=1536, out_features=240, bias=False)
|
78 |
+
(prob_out): Linear(in_features=1536, out_features=3, bias=True)
|
79 |
+
)
|
80 |
+
(taco2_loss): Tacotron2Loss(
|
81 |
+
(l1_criterion): L1Loss()
|
82 |
+
(mse_criterion): MSELoss()
|
83 |
+
(bce_criterion): BCEWithLogitsLoss()
|
84 |
+
)
|
85 |
+
(attn_loss): GuidedAttentionLoss()
|
86 |
+
)
|
87 |
+
)
|
88 |
+
|
89 |
+
Model summary:
|
90 |
+
Class Name: ESPnetTTSModel
|
91 |
+
Total Number of model parameters: 26.91 M
|
92 |
+
Number of trainable parameters: 26.91 M (100.0%)
|
93 |
+
Size: 107.63 MB
|
94 |
+
Type: torch.float32
|
95 |
+
[7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1207) INFO: Optimizer:
|
96 |
+
Adam (
|
97 |
+
Parameter Group 0
|
98 |
+
amsgrad: False
|
99 |
+
betas: (0.9, 0.999)
|
100 |
+
capturable: False
|
101 |
+
differentiable: False
|
102 |
+
eps: 1e-06
|
103 |
+
foreach: None
|
104 |
+
fused: None
|
105 |
+
lr: 0.001
|
106 |
+
maximize: False
|
107 |
+
weight_decay: 0.0
|
108 |
+
)
|
109 |
+
[7850374a3496] 2023-07-13 14:11:05,064 (abs_task:1208) INFO: Scheduler: None
|
110 |
+
[7850374a3496] 2023-07-13 14:11:05,065 (abs_task:1217) INFO: Saving the configuration in exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml
|
111 |
+
[7850374a3496] 2023-07-13 14:11:05,100 (abs_task:1228) INFO: Namespace(config='conf/tuning/finetune_tacotron2.yaml', print_config=False, log_level='INFO', dry_run=False, iterator_type='sequence', output_dir='exp/tts_stats_raw_phn_none/logdir/stats.17', ngpu=0, seed=0, num_workers=1, num_att_plot=3, dist_backend='nccl', dist_init_method='env://', dist_world_size=None, dist_rank=None, local_rank=None, dist_master_addr=None, dist_master_port=None, dist_launcher=None, multiprocessing_distributed=False, unused_parameters=False, sharded_ddp=False, cudnn_enabled=True, cudnn_benchmark=False, cudnn_deterministic=True, collect_stats=True, write_collected_feats=False, max_epoch=120, patience=None, val_scheduler_criterion=('valid', 'loss'), early_stopping_criterion=('valid', 'loss', 'min'), best_model_criterion=[['valid', 'loss', 'min'], ['train', 'loss', 'min']], keep_nbest_models=5, nbest_averaging_interval=0, grad_clip=1.0, grad_clip_type=2.0, grad_noise=False, accum_grad=1, no_forward_run=False, resume=False, train_dtype='float32', use_amp=False, log_interval=None, use_matplotlib=True, use_tensorboard=True, create_graph_in_tensorboard=False, use_wandb=False, wandb_project=None, wandb_id=None, wandb_entity=None, wandb_name=None, wandb_model_log_interval=-1, detect_anomaly=False, pretrain_path=None, init_param=[], ignore_init_mismatch=False, freeze_param=[], num_iters_per_epoch=200, batch_size=20, valid_batch_size=None, batch_bins=1600000, valid_batch_bins=None, train_shape_file=['exp/tts_stats_raw_phn_none/logdir/train.17.scp'], valid_shape_file=['exp/tts_stats_raw_phn_none/logdir/valid.17.scp'], batch_type='numel', valid_batch_type=None, fold_length=[], sort_in_batch='descending', sort_batch='descending', multiple_iterator=False, chunk_length=500, chunk_shift_ratio=0.5, num_cache_chunks=1024, chunk_excluded_key_prefixes=[], train_data_path_and_name_and_type=[('dump/raw/train/text', 'text', 'text'), ('dump/raw/train/wav.scp', 'speech', 'sound')], valid_data_path_and_name_and_type=[('dump/raw/dev/text', 'text', 'text'), ('dump/raw/dev/wav.scp', 'speech', 'sound')], allow_variable_data_keys=False, max_cache_size=0.0, max_cache_fd=32, valid_max_cache_size=None, exclude_weight_decay=False, exclude_weight_decay_conf={}, optim='adam', optim_conf={'lr': 0.001, 'eps': 1e-06, 'weight_decay': 0.0}, scheduler=None, scheduler_conf={}, token_list=['<blank>', '<unk>', 'a', 'sil', 'l', 'aa', 'm', 'ii0', 't', '<', 'n', 'r', 'E', 'i0', 'b', 'uu0', 'f', 'i1', 'k', 'w', 'A', 's', 'y', 'd', 'q', 'h', 'H', '$', 'u0', 'AA', 'j', 'T', 'x', 'S', 'z', 'll', 'I1', 'D', 'II0', 'g', 'tt', 'rr', 'I0', 'UU0', 'dd', 'u1', 'U0', 'mm', 'nn', '*', '$$', 'bb', 'yy', 'ss', 'jj', 'ww', '^', 'SS', 'TT', 'Z', 'zz', 'kk', 'U1', 'HH', 'ff', 'qq', 'xx', '^^', 'DD', 'hh', 'EE', 'ZZ', '**', 'aaaa', 'ssss', 'v', 'uu1', 'jjjj', '<sos/eos>'], odim=None, model_conf={}, use_preprocessor=True, token_type='phn', bpemodel=None, non_linguistic_symbols=None, cleaner=None, g2p=None, feats_extract='fbank', feats_extract_conf={'n_fft': 1024, 'hop_length': 256, 'win_length': None, 'fs': 22050, 'fmin': 80, 'fmax': 7600, 'n_mels': 80}, normalize=None, normalize_conf={}, tts='tacotron2', tts_conf={'embed_dim': 512, 'elayers': 1, 'eunits': 512, 'econv_layers': 3, 'econv_chans': 512, 'econv_filts': 5, 'atype': 'location', 'adim': 512, 'aconv_chans': 32, 'aconv_filts': 15, 'cumulate_att_w': True, 'dlayers': 2, 'dunits': 1024, 'prenet_layers': 2, 'prenet_units': 256, 'postnet_layers': 5, 'postnet_chans': 512, 'postnet_filts': 5, 'output_activation': None, 'use_batch_norm': True, 'use_concate': True, 'use_residual': False, 'dropout_rate': 0.5, 'zoneout_rate': 0.1, 'reduction_factor': 3, 'spk_embed_dim': None, 'use_masking': True, 'bce_pos_weight': 20.0, 'use_guided_attn_loss': True, 'guided_attn_loss_sigma': 0.4, 'guided_attn_loss_lambda': 1.0}, pitch_extract=None, pitch_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'f0max': 400, 'f0min': 80}, pitch_normalize=None, pitch_normalize_conf={}, energy_extract=None, energy_extract_conf={'fs': 22050, 'n_fft': 1024, 'hop_length': 256, 'win_length': None}, energy_normalize=None, energy_normalize_conf={}, required=['output_dir', 'token_list'], version='202304', distributed=False)
|
112 |
+
/opt/conda/lib/python3.10/site-packages/torch/functional.py:641: UserWarning: stft with return_complex=False is deprecated. In a future pytorch release, stft will return complex tensors for all inputs, and return_complex=False will raise an error.
|
113 |
+
Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/SpectralOps.cpp:862.)
|
114 |
+
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
|
115 |
+
# Accounting: time=13 threads=1
|
116 |
+
# Ended (code 0) at Thu Jul 13 14:11:09 UTC 2023, elapsed time 13 seconds
|
exp/tts_stats_raw_phn_none/logdir/stats.17/config.yaml
ADDED
@@ -0,0 +1,267 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
config: conf/tuning/finetune_tacotron2.yaml
|
2 |
+
print_config: false
|
3 |
+
log_level: INFO
|
4 |
+
dry_run: false
|
5 |
+
iterator_type: sequence
|
6 |
+
output_dir: exp/tts_stats_raw_phn_none/logdir/stats.17
|
7 |
+
ngpu: 0
|
8 |
+
seed: 0
|
9 |
+
num_workers: 1
|
10 |
+
num_att_plot: 3
|
11 |
+
dist_backend: nccl
|
12 |
+
dist_init_method: env://
|
13 |
+
dist_world_size: null
|
14 |
+
dist_rank: null
|
15 |
+
local_rank: null
|
16 |
+
dist_master_addr: null
|
17 |
+
dist_master_port: null
|
18 |
+
dist_launcher: null
|
19 |
+
multiprocessing_distributed: false
|
20 |
+
unused_parameters: false
|
21 |
+
sharded_ddp: false
|
22 |
+
cudnn_enabled: true
|
23 |
+
cudnn_benchmark: false
|
24 |
+
cudnn_deterministic: true
|
25 |
+
collect_stats: true
|
26 |
+
write_collected_feats: false
|
27 |
+
max_epoch: 120
|
28 |
+
patience: null
|
29 |
+
val_scheduler_criterion:
|
30 |
+
- valid
|
31 |
+
- loss
|
32 |
+
early_stopping_criterion:
|
33 |
+
- valid
|
34 |
+
- loss
|
35 |
+
- min
|
36 |
+
best_model_criterion:
|
37 |
+
- - valid
|
38 |
+
- loss
|
39 |
+
- min
|
40 |
+
- - train
|
41 |
+
- loss
|
42 |
+
- min
|
43 |
+
keep_nbest_models: 5
|
44 |
+
nbest_averaging_interval: 0
|
45 |
+
grad_clip: 1.0
|
46 |
+
grad_clip_type: 2.0
|
47 |
+
grad_noise: false
|
48 |
+
accum_grad: 1
|
49 |
+
no_forward_run: false
|
50 |
+
resume: false
|
51 |
+
train_dtype: float32
|
52 |
+
use_amp: false
|
53 |
+
log_interval: null
|
54 |
+
use_matplotlib: true
|
55 |
+
use_tensorboard: true
|
56 |
+
create_graph_in_tensorboard: false
|
57 |
+
use_wandb: false
|
58 |
+
wandb_project: null
|
59 |
+
wandb_id: null
|
60 |
+
wandb_entity: null
|
61 |
+
wandb_name: null
|
62 |
+
wandb_model_log_interval: -1
|
63 |
+
detect_anomaly: false
|
64 |
+
pretrain_path: null
|
65 |
+
init_param: []
|
66 |
+
ignore_init_mismatch: false
|
67 |
+
freeze_param: []
|
68 |
+
num_iters_per_epoch: 200
|
69 |
+
batch_size: 20
|
70 |
+
valid_batch_size: null
|
71 |
+
batch_bins: 1600000
|
72 |
+
valid_batch_bins: null
|
73 |
+
train_shape_file:
|
74 |
+
- exp/tts_stats_raw_phn_none/logdir/train.17.scp
|
75 |
+
valid_shape_file:
|
76 |
+
- exp/tts_stats_raw_phn_none/logdir/valid.17.scp
|
77 |
+
batch_type: numel
|
78 |
+
valid_batch_type: null
|
79 |
+
fold_length: []
|
80 |
+
sort_in_batch: descending
|
81 |
+
sort_batch: descending
|
82 |
+
multiple_iterator: false
|
83 |
+
chunk_length: 500
|
84 |
+
chunk_shift_ratio: 0.5
|
85 |
+
num_cache_chunks: 1024
|
86 |
+
chunk_excluded_key_prefixes: []
|
87 |
+
train_data_path_and_name_and_type:
|
88 |
+
- - dump/raw/train/text
|
89 |
+
- text
|
90 |
+
- text
|
91 |
+
- - dump/raw/train/wav.scp
|
92 |
+
- speech
|
93 |
+
- sound
|
94 |
+
valid_data_path_and_name_and_type:
|
95 |
+
- - dump/raw/dev/text
|
96 |
+
- text
|
97 |
+
- text
|
98 |
+
- - dump/raw/dev/wav.scp
|
99 |
+
- speech
|
100 |
+
- sound
|
101 |
+
allow_variable_data_keys: false
|
102 |
+
max_cache_size: 0.0
|
103 |
+
max_cache_fd: 32
|
104 |
+
valid_max_cache_size: null
|
105 |
+
exclude_weight_decay: false
|
106 |
+
exclude_weight_decay_conf: {}
|
107 |
+
optim: adam
|
108 |
+
optim_conf:
|
109 |
+
lr: 0.001
|
110 |
+
eps: 1.0e-06
|
111 |
+
weight_decay: 0.0
|
112 |
+
scheduler: null
|
113 |
+
scheduler_conf: {}
|
114 |
+
token_list:
|
115 |
+
- <blank>
|
116 |
+
- <unk>
|
117 |
+
- a
|
118 |
+
- sil
|
119 |
+
- l
|
120 |
+
- aa
|
121 |
+
- m
|
122 |
+
- ii0
|
123 |
+
- t
|
124 |
+
- <
|
125 |
+
- n
|
126 |
+
- r
|
127 |
+
- E
|
128 |
+
- i0
|
129 |
+
- b
|
130 |
+
- uu0
|
131 |
+
- f
|
132 |
+
- i1
|
133 |
+
- k
|
134 |
+
- w
|
135 |
+
- A
|
136 |
+
- s
|
137 |
+
- y
|
138 |
+
- d
|
139 |
+
- q
|
140 |
+
- h
|
141 |
+
- H
|
142 |
+
- $
|
143 |
+
- u0
|
144 |
+
- AA
|
145 |
+
- j
|
146 |
+
- T
|
147 |
+
- x
|
148 |
+
- S
|
149 |
+
- z
|
150 |
+
- ll
|
151 |
+
- I1
|
152 |
+
- D
|
153 |
+
- II0
|
154 |
+
- g
|
155 |
+
- tt
|
156 |
+
- rr
|
157 |
+
- I0
|
158 |
+
- UU0
|
159 |
+
- dd
|
160 |
+
- u1
|
161 |
+
- U0
|
162 |
+
- mm
|
163 |
+
- nn
|
164 |
+
- '*'
|
165 |
+
- $$
|
166 |
+
- bb
|
167 |
+
- yy
|
168 |
+
- ss
|
169 |
+
- jj
|
170 |
+
- ww
|
171 |
+
- ^
|
172 |
+
- SS
|
173 |
+
- TT
|
174 |
+
- Z
|
175 |
+
- zz
|
176 |
+
- kk
|
177 |
+
- U1
|
178 |
+
- HH
|
179 |
+
- ff
|
180 |
+
- qq
|
181 |
+
- xx
|
182 |
+
- ^^
|
183 |
+
- DD
|
184 |
+
- hh
|
185 |
+
- EE
|
186 |
+
- ZZ
|
187 |
+
- '**'
|
188 |
+
- aaaa
|
189 |
+
- ssss
|
190 |
+
- v
|
191 |
+
- uu1
|
192 |
+
- jjjj
|
193 |
+
- <sos/eos>
|
194 |
+
odim: null
|
195 |
+
model_conf: {}
|
196 |
+
use_preprocessor: true
|
197 |
+
token_type: phn
|
198 |
+
bpemodel: null
|
199 |
+
non_linguistic_symbols: null
|
200 |
+
cleaner: null
|
201 |
+
g2p: null
|
202 |
+
feats_extract: fbank
|
203 |
+
feats_extract_conf:
|
204 |
+
n_fft: 1024
|
205 |
+
hop_length: 256
|
206 |
+
win_length: null
|
207 |
+
fs: 22050
|
208 |
+
fmin: 80
|
209 |
+
fmax: 7600
|
210 |
+
n_mels: 80
|
211 |
+
normalize: null
|
212 |
+
normalize_conf: {}
|
213 |
+
tts: tacotron2
|
214 |
+
tts_conf:
|
215 |
+
embed_dim: 512
|
216 |
+
elayers: 1
|
217 |
+
eunits: 512
|
218 |
+
econv_layers: 3
|
219 |
+
econv_chans: 512
|
220 |
+
econv_filts: 5
|
221 |
+
atype: location
|
222 |
+
adim: 512
|
223 |
+
aconv_chans: 32
|
224 |
+
aconv_filts: 15
|
225 |
+
cumulate_att_w: true
|
226 |
+
dlayers: 2
|
227 |
+
dunits: 1024
|
228 |
+
prenet_layers: 2
|
229 |
+
prenet_units: 256
|
230 |
+
postnet_layers: 5
|
231 |
+
postnet_chans: 512
|
232 |
+
postnet_filts: 5
|
233 |
+
output_activation: null
|
234 |
+
use_batch_norm: true
|
235 |
+
use_concate: true
|
236 |
+
use_residual: false
|
237 |
+
dropout_rate: 0.5
|
238 |
+
zoneout_rate: 0.1
|
239 |
+
reduction_factor: 3
|
240 |
+
spk_embed_dim: null
|
241 |
+
use_masking: true
|
242 |
+
bce_pos_weight: 20.0
|
243 |
+
use_guided_attn_loss: true
|
244 |
+
guided_attn_loss_sigma: 0.4
|
245 |
+
guided_attn_loss_lambda: 1.0
|
246 |
+
pitch_extract: null
|
247 |
+
pitch_extract_conf:
|
248 |
+
fs: 22050
|
249 |
+
n_fft: 1024
|
250 |
+
hop_length: 256
|
251 |
+
f0max: 400
|
252 |
+
f0min: 80
|
253 |
+
pitch_normalize: null
|
254 |
+
pitch_normalize_conf: {}
|
255 |
+
energy_extract: null
|
256 |
+
energy_extract_conf:
|
257 |
+
fs: 22050
|
258 |
+
n_fft: 1024
|
259 |
+
hop_length: 256
|
260 |
+
win_length: null
|
261 |
+
energy_normalize: null
|
262 |
+
energy_normalize_conf: {}
|
263 |
+
required:
|
264 |
+
- output_dir
|
265 |
+
- token_list
|
266 |
+
version: '202304'
|
267 |
+
distributed: false
|
exp/tts_stats_raw_phn_none/logdir/stats.17/train/batch_keys
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
text
|
2 |
+
speech
|
exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_lengths_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:503b8c916b78f4942fd868b9337455d0a4593217bf37efc50c5e8192e7949a22
|
3 |
+
size 778
|
exp/tts_stats_raw_phn_none/logdir/stats.17/train/feats_stats.npz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:242283537138ddfc5699bdd17945a6d6bf4a95ff1d368666989414d0b47ca626
|
3 |
+
size 1402
|