# File size: 5,033 Bytes
# Revision: 3b5f98a
---
# Gradient-accumulation and batching values recorded in this dump.
accum_grad: 8
allow_variable_data_keys: false
batch_bins: 3000000
batch_size: 20
batch_type: numel

# Two criteria, each written as a three-item list (here: split, metric, mode).
best_model_criterion:
  - [valid, loss, min]
  - [train, loss, min]

bpemodel: null
chunk_length: 500
chunk_shift_ratio: 0.5
cleaner: null
collect_stats: false
config: conf/tuning/train_fastspeech2.yaml

# cuDNN flags.
cudnn_benchmark: false
cudnn_deterministic: true
cudnn_enabled: true

detect_anomaly: false

# Distributed-run values captured in this dump: NCCL backend, world size 8,
# rank 0, master at localhost:59485.
# NOTE(review): these look like run-time state of the original job rather
# than tunables - confirm before reusing them for a new run.
dist_backend: nccl
dist_init_method: env://
dist_launcher: null
dist_master_addr: localhost
dist_master_port: 59485
dist_rank: 0
dist_world_size: 8
distributed: true

dry_run: false

# Same three-item shape as the entries of best_model_criterion.
early_stopping_criterion: [valid, loss, min]
# Energy extraction. fs / n_fft / hop_length carry the same values as in
# feats_extract_conf below.
energy_extract: energy
energy_extract_conf:
  fs: 22050
  n_fft: 1024
  hop_length: 256
  win_length: null
  reduction_factor: 1

# Energy normalisation reads its statistics from an absolute path.
# NOTE(review): the path is specific to the machine that produced this dump -
# confirm it exists wherever this config is loaded.
energy_normalize: global_mvn
energy_normalize_conf:
  stats_file: /home/speech/Fastspeech2_HS/english/male/model/energy_stats.npz

# Acoustic feature extraction: 80 mel bins over the 0-8000 range.
feats_extract: fbank
feats_extract_conf:
  fs: 22050
  n_fft: 1024
  hop_length: 256
  win_length: null
  n_mels: 80
  fmin: 0
  fmax: 8000

# Two integers; presumably one per shape file - TODO confirm against consumer.
fold_length: [150, 204800]
# Empty list: no entries are given for freeze_param in this dump.
freeze_param: []
# NOTE(review): g2p is set although token_type (elsewhere in this file) is
# "char" - confirm which text frontend the consumer actually applies.
g2p: g2p_en_no_space
# Gradient-clipping values (threshold 1.0, type 2.0).
grad_clip: 1.0
grad_clip_type: 2.0
grad_noise: false
ignore_init_mismatch: false
# Empty list: no entries are given for init_param in this dump.
init_param: []
iterator_type: sequence
keep_nbest_models: 5
local_rank: 0
log_interval: null
log_level: INFO
max_cache_fd: 32
max_cache_size: 0.0
max_epoch: 1000
# Empty mapping: no model_conf overrides are recorded.
model_conf: {}
multiple_iterator: false
multiprocessing_distributed: true
ngpu: 1
no_forward_run: false
non_linguistic_symbols: null
# Feature normalisation reads its statistics from an absolute path.
# NOTE(review): the path is specific to the machine that produced this dump -
# confirm it exists wherever this config is loaded.
normalize: global_mvn
normalize_conf:
  stats_file: /home/speech/Fastspeech2_HS/english/male/model/feats_stats.npz
num_att_plot: 3
num_cache_chunks: 1024
num_iters_per_epoch: 800
num_workers: 1
odim: null
# Optimiser name and its single recorded option (lr).
# NOTE(review): lr is 1.0 and a scheduler is configured elsewhere in this
# file - presumably the scheduler rescales it; confirm before changing.
optim: adam
optim_conf:
  lr: 1.0
# Relative path; resolved against the consumer's working directory.
output_dir: exp/tts_train_fastspeech2_raw_char_None
patience: null
# Pitch extraction with an f0 search range of 40-400.
pitch_extract: dio
pitch_extract_conf:
  fs: 22050
  n_fft: 1024
  hop_length: 256
  f0min: 40
  f0max: 400
  reduction_factor: 1

# Pitch normalisation reads its statistics from an absolute path.
# NOTE(review): the path is specific to the machine that produced this dump -
# confirm it exists wherever this config is loaded.
pitch_normalize: global_mvn
pitch_normalize_conf:
  stats_file: /home/speech/Fastspeech2_HS/english/male/model/pitch_stats.npz

pretrain_path: null
print_config: false

# Names of two other top-level keys of this file.
required: [output_dir, token_list]

resume: true

# Scheduler name and options. model_size has the same value (384) as
# tts_conf.adim in this file.
scheduler: noamlr
scheduler_conf:
  model_size: 384
  warmup_steps: 4000

seed: 0
sharded_ddp: false
sort_batch: descending
sort_in_batch: descending
# Symbol inventory, 57 entries.
# NOTE(review): list position is presumably the integer token id used by the
# model - do not reorder, insert or remove entries without confirming.
#
# 'n' and 'y' are quoted on purpose: the YAML 1.1 bool type treats the bare
# scalars n / y as false / true, so a strict 1.1 loader would replace those
# two symbols with booleans. Quoting keeps them strings on every loader.
# The "\uXXXX" entries are escaped code points from the Devanagari block
# (U+0900-U+097F); double quotes are required for the escapes to be decoded.
token_list:
- <blank>
- <unk>
- <space>
- a
- r
- 'n'
- "\u091F"
- i
- "\u0921"
- E
- s
- l
- d
- w
- I
- m
- k
- z
- f
- "\u0905"
- h
- "\u0911"
- U
- A
- .
- "\u0910"
- ','
- p
- b
- "\u0919"
- o
- g
- 'y'
- "\u0936"
- "\u0914"
- t
- u
- c
- j
- '?'
- '!'
- q
- "\u0923"
- "\u0927"
- "\u0925"
- "\u0937"
- B
- H
- P
- D
- M
- v
- C
- R
- "\u0918"
- "\u0916"
- <sos/eos>
token_type: char
# Training inputs: three entries, each a three-item list
# (here: path, name, type). All paths are relative.
train_data_path_and_name_and_type:
  - - dump/raw/tr_no_dev/text
    - text
    - text
  - - duration_info_from_teacher/decode_use_teacher_forcingtrue_train.loss.ave/tr_no_dev/durations
    - durations
    - text_int
  - - dump/raw/tr_no_dev/wav.scp
    - speech
    - sound

train_dtype: float32

# Two shape files: one for text, one for speech.
train_shape_file:
  - exp/tts_stats_raw_char_None/train/text_shape.char
  - exp/tts_stats_raw_char_None/train/speech_shape
# Model selector and its option mapping.
tts: fastspeech2
tts_conf:
  # adim has the same value (384) as scheduler_conf.model_size in this file.
  # NOTE(review): presumably these must stay in sync - confirm.
  adim: 384
  aheads: 2
  decoder_normalize_before: true
  # Decoder depth / width; same values as elayers / eunits further down.
  dlayers: 4
  dunits: 1536
  # Duration predictor options.
  duration_predictor_chans: 256
  duration_predictor_kernel_size: 3
  duration_predictor_layers: 2
  elayers: 4
  encoder_normalize_before: true
  # Energy embedding and predictor options.
  energy_embed_dropout: 0.0
  energy_embed_kernel_size: 1
  energy_predictor_chans: 256
  energy_predictor_dropout: 0.5
  energy_predictor_kernel_size: 3
  energy_predictor_layers: 2
  eunits: 1536
  init_dec_alpha: 1.0
  init_enc_alpha: 1.0
  init_type: xavier_uniform
  # Pitch embedding and predictor options. The predictor is configured with
  # more layers (5) and a larger kernel (5) than the energy predictor (2 / 3).
  pitch_embed_dropout: 0.0
  pitch_embed_kernel_size: 1
  pitch_predictor_chans: 256
  pitch_predictor_dropout: 0.5
  pitch_predictor_kernel_size: 5
  pitch_predictor_layers: 5
  positionwise_conv_kernel_size: 3
  positionwise_layer_type: conv1d
  # Postnet options.
  postnet_chans: 256
  postnet_filts: 5
  postnet_layers: 5
  # Same value (1) as reduction_factor in the energy/pitch extract configs.
  reduction_factor: 1
  # The two stop-gradient flags differ: energy false, pitch true.
  stop_gradient_from_energy_predictor: false
  stop_gradient_from_pitch_predictor: true
  # All six transformer dropout rates are 0.2.
  transformer_dec_attn_dropout_rate: 0.2
  transformer_dec_dropout_rate: 0.2
  transformer_dec_positional_dropout_rate: 0.2
  transformer_enc_attn_dropout_rate: 0.2
  transformer_enc_dropout_rate: 0.2
  transformer_enc_positional_dropout_rate: 0.2
  use_masking: true
  use_scaled_pos_enc: true
unused_parameters: false

# Feature toggles.
use_amp: false
use_preprocessor: true
use_tensorboard: true
use_wandb: false

# Two-item list (no third item, unlike early_stopping_criterion).
val_scheduler_criterion: [valid, loss]

# Validation batching overrides; all unset in this dump.
valid_batch_bins: null
valid_batch_size: null
valid_batch_type: null

# Validation inputs: same three-entry layout as the training inputs, with the
# dev split's relative paths.
valid_data_path_and_name_and_type:
  - - dump/raw/dev/text
    - text
    - text
  - - duration_info_from_teacher/decode_use_teacher_forcingtrue_train.loss.ave/dev/durations
    - durations
    - text_int
  - - dump/raw/dev/wav.scp
    - speech
    - sound

valid_max_cache_size: null

valid_shape_file:
  - exp/tts_stats_raw_char_None/valid/text_shape.char
  - exp/tts_stats_raw_char_None/valid/speech_shape

# Quoted so it is unambiguously a string.
version: '0.10.3a3'

# Weights & Biases fields; all unset except the log interval (-1).
wandb_entity: null
wandb_id: null
wandb_model_log_interval: -1
wandb_name: null
wandb_project: null

write_collected_feats: false