File size: 2,747 Bytes
d8714d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# ################################
# Model: Fastspeech2 for TTS
# Authors: Sathvik Udupa, Yingzhi Wang, Pradnya Kandarkar 
# ################################
# Input parameters
lexicon:
    - AA
    - AE
    - AH
    - AO
    - AW
    - AY
    - B
    - CH
    - D
    - DH
    - EH
    - ER
    - EY
    - F
    - G
    - HH
    - IH
    - IY
    - JH
    - K
    - L
    - M
    - N
    - NG
    - OW
    - OY
    - P
    - R
    - S
    - SH
    - T
    - TH
    - UH
    - UW
    - V
    - W
    - Y
    - Z
    - ZH
    - spn

n_symbols: 41 #fixed deppending on symbols in the lexicon +1 for a dummy symbol used for padding
padding_idx: 0
n_mel_channels: 80

# Encoder parameters
enc_num_layers: 4
enc_num_head: 2
enc_d_model: 256
enc_ffn_dim: 1024
enc_k_dim: 256
enc_v_dim: 256
enc_dropout: 0.2

# Decoder parameters
dec_num_layers: 4
dec_num_head: 2
dec_d_model: 256
dec_ffn_dim: 1024
dec_k_dim: 256
dec_v_dim: 256
dec_dropout: 0.2

# Postnet parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.5

# Common
normalize_before: True
ffn_type: 1dcnn #1dcnn or ffn
ffn_cnn_kernel_size_list: [9, 1]

# Variance predictor
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5

# Model
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    dec_num_layers: !ref <dec_num_layers>
    dec_num_head: !ref <dec_num_head>
    dec_d_model: !ref <dec_d_model>
    dec_ffn_dim: !ref <dec_ffn_dim>
    dec_k_dim: !ref <dec_k_dim>
    dec_v_dim: !ref <dec_v_dim>
    dec_dropout: !ref <dec_dropout>
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    n_char: !ref <n_symbols>
    n_mels: !ref <n_mel_channels>
    postnet_embedding_dim: !ref <postnet_embedding_dim>
    postnet_kernel_size: !ref <postnet_kernel_size>
    postnet_n_convolutions: !ref <postnet_n_convolutions>
    postnet_dropout: !ref <postnet_dropout>
    padding_idx: !ref <padding_idx>
    dur_pred_kernel_size: !ref <dur_pred_kernel_size>
    pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
    energy_pred_kernel_size: !ref <energy_pred_kernel_size>
    variance_predictor_dropout: !ref <variance_predictor_dropout>


input_encoder: !new:speechbrain.dataio.encoder.TextEncoder

modules:
    model: !ref <model>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>