File size: 3,390 Bytes
d8714d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc4865c
d8714d0
 
 
 
 
 
cc4865c
d8714d0
cc4865c
 
d8714d0
 
 
 
 
cc4865c
d8714d0
cc4865c
 
d8714d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc4865c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8714d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc4865c
d8714d0
 
 
 
cc4865c
d8714d0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# ################################
# Model: Fastspeech2 for TTS
# Authors: Sathvik Udupa, Yingzhi Wang, Pradnya Kandarkar 
# ################################
# Input parameters
lexicon:
    - AA
    - AE
    - AH
    - AO
    - AW
    - AY
    - B
    - CH
    - D
    - DH
    - EH
    - ER
    - EY
    - F
    - G
    - HH
    - IH
    - IY
    - JH
    - K
    - L
    - M
    - N
    - NG
    - OW
    - OY
    - P
    - R
    - S
    - SH
    - T
    - TH
    - UH
    - UW
    - V
    - W
    - Y
    - Z
    - ZH
    - spn

n_symbols: 42 #fixed deppending on symbols in the lexicon +1 for a dummy symbol used for padding
padding_idx: 0
n_mel_channels: 80

# Encoder parameters
enc_num_layers: 4
enc_num_head: 2
enc_d_model: 384
enc_ffn_dim: 1024
enc_k_dim: 384
enc_v_dim: 384
enc_dropout: 0.2

# Decoder parameters
dec_num_layers: 4
dec_num_head: 2
dec_d_model: 384
dec_ffn_dim: 1024
dec_k_dim: 384
dec_v_dim: 384
dec_dropout: 0.2

# Postnet parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.5

# Common
normalize_before: True
ffn_type: 1dcnn #1dcnn or ffn
ffn_cnn_kernel_size_list: [9, 1]

# Variance predictor
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5

# silent phoneme token predictor
spn_predictor: !new:speechbrain.lobes.models.FastSpeech2.SPNPredictor
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    n_char: !ref <n_symbols>
    padding_idx: !ref <padding_idx>

# Model
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    dec_num_layers: !ref <dec_num_layers>
    dec_num_head: !ref <dec_num_head>
    dec_d_model: !ref <dec_d_model>
    dec_ffn_dim: !ref <dec_ffn_dim>
    dec_k_dim: !ref <dec_k_dim>
    dec_v_dim: !ref <dec_v_dim>
    dec_dropout: !ref <dec_dropout>
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    n_char: !ref <n_symbols>
    n_mels: !ref <n_mel_channels>
    postnet_embedding_dim: !ref <postnet_embedding_dim>
    postnet_kernel_size: !ref <postnet_kernel_size>
    postnet_n_convolutions: !ref <postnet_n_convolutions>
    postnet_dropout: !ref <postnet_dropout>
    padding_idx: !ref <padding_idx>
    dur_pred_kernel_size: !ref <dur_pred_kernel_size>
    pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
    energy_pred_kernel_size: !ref <energy_pred_kernel_size>
    variance_predictor_dropout: !ref <variance_predictor_dropout>


input_encoder: !new:speechbrain.dataio.encoder.TextEncoder

modules:
    spn_predictor: !ref <spn_predictor>
    model: !ref <model>

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        spn_predictor: !ref <spn_predictor>
        model: !ref <model>