tts-fastspeech2-ljspeech / hyperparams.yaml
yingzhi's picture
add spn_predictor
cc4865c
raw
history blame
3.39 kB
# ################################
# Model: Fastspeech2 for TTS
# Authors: Sathvik Udupa, Yingzhi Wang, Pradnya Kandarkar
# ################################
# Input parameters
lexicon:
- AA
- AE
- AH
- AO
- AW
- AY
- B
- CH
- D
- DH
- EH
- ER
- EY
- F
- G
- HH
- IH
- IY
- JH
- K
- L
- M
- N
- NG
- OW
- OY
- P
- R
- S
- SH
- T
- TH
- UH
- UW
- V
- W
- Y
- Z
- ZH
- spn
n_symbols: 42 #fixed deppending on symbols in the lexicon +1 for a dummy symbol used for padding
padding_idx: 0
n_mel_channels: 80
# Encoder parameters
enc_num_layers: 4
enc_num_head: 2
enc_d_model: 384
enc_ffn_dim: 1024
enc_k_dim: 384
enc_v_dim: 384
enc_dropout: 0.2
# Decoder parameters
dec_num_layers: 4
dec_num_head: 2
dec_d_model: 384
dec_ffn_dim: 1024
dec_k_dim: 384
dec_v_dim: 384
dec_dropout: 0.2
# Postnet parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.5
# Common
normalize_before: True
ffn_type: 1dcnn #1dcnn or ffn
ffn_cnn_kernel_size_list: [9, 1]
# Variance predictor
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5
# silent phoneme token predictor
spn_predictor: !new:speechbrain.lobes.models.FastSpeech2.SPNPredictor
enc_num_layers: !ref <enc_num_layers>
enc_num_head: !ref <enc_num_head>
enc_d_model: !ref <enc_d_model>
enc_ffn_dim: !ref <enc_ffn_dim>
enc_k_dim: !ref <enc_k_dim>
enc_v_dim: !ref <enc_v_dim>
enc_dropout: !ref <enc_dropout>
normalize_before: !ref <normalize_before>
ffn_type: !ref <ffn_type>
ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
n_char: !ref <n_symbols>
padding_idx: !ref <padding_idx>
# Model
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
enc_num_layers: !ref <enc_num_layers>
enc_num_head: !ref <enc_num_head>
enc_d_model: !ref <enc_d_model>
enc_ffn_dim: !ref <enc_ffn_dim>
enc_k_dim: !ref <enc_k_dim>
enc_v_dim: !ref <enc_v_dim>
enc_dropout: !ref <enc_dropout>
dec_num_layers: !ref <dec_num_layers>
dec_num_head: !ref <dec_num_head>
dec_d_model: !ref <dec_d_model>
dec_ffn_dim: !ref <dec_ffn_dim>
dec_k_dim: !ref <dec_k_dim>
dec_v_dim: !ref <dec_v_dim>
dec_dropout: !ref <dec_dropout>
normalize_before: !ref <normalize_before>
ffn_type: !ref <ffn_type>
ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
n_char: !ref <n_symbols>
n_mels: !ref <n_mel_channels>
postnet_embedding_dim: !ref <postnet_embedding_dim>
postnet_kernel_size: !ref <postnet_kernel_size>
postnet_n_convolutions: !ref <postnet_n_convolutions>
postnet_dropout: !ref <postnet_dropout>
padding_idx: !ref <padding_idx>
dur_pred_kernel_size: !ref <dur_pred_kernel_size>
pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
energy_pred_kernel_size: !ref <energy_pred_kernel_size>
variance_predictor_dropout: !ref <variance_predictor_dropout>
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
modules:
spn_predictor: !ref <spn_predictor>
model: !ref <model>
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
loadables:
spn_predictor: !ref <spn_predictor>
model: !ref <model>