nonoJDWAOIDAWKDA commited on
Commit
39065c6
·
verified ·
1 Parent(s): 33d071c

Upload StyleTTS2 checkpoint epoch_2nd_00009.pth with all inference components

Browse files
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.pth filter=lfs diff=lfs merge=lfs -text
2
+ *.t7 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
README.md ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ tags:
4
+ - text-to-speech
5
+ - StyleTTS2
6
+ - speech-synthesis
7
+ license: mit
8
+ pipeline_tag: text-to-speech
9
+ ---
10
+
11
+ # StyleTTS2 Fine-tuned Model
12
+
13
+ This model is a fine-tuned version of StyleTTS2, containing all necessary components for inference.
14
+
15
+ ## Model Details
16
+ - **Base Model:** StyleTTS2-LibriTTS
17
+ - **Architecture:** StyleTTS2
18
+ - **Task:** Text-to-Speech
19
+ - **Last Checkpoint:** epoch_2nd_00009.pth
20
+
21
+ ## Training Details
22
+ - **Total Epochs:** 10
23
+ - **Completed Epochs:** 9
24
+ - **Total Iterations:** 4080
25
+ - **Batch Size:** 2
26
+ - **Max Length:** 630
27
+ - **Learning Rate:** 0.0001
28
+ - **Final Validation Loss:** 0.407607
29
+
30
+ ## Model Components
31
+ The repository includes all necessary components for inference:
32
+
33
+ ### Main Model Components:
34
+ - bert.pth
35
+ - bert_encoder.pth
36
+ - predictor.pth
37
+ - decoder.pth
38
+ - text_encoder.pth
39
+ - predictor_encoder.pth
40
+ - style_encoder.pth
41
+ - diffusion.pth
42
+ - text_aligner.pth
43
+ - pitch_extractor.pth
44
+ - mpd.pth
45
+ - msd.pth
46
+ - wd.pth
47
+
48
+ ### Utility Components:
49
+ - ASR (Automatic Speech Recognition)
50
+ - epoch_00080.pth
51
+ - config.yml
52
+ - models.py
53
+ - layers.py
54
+ - JDC (F0 Prediction)
55
+ - bst.t7
56
+ - model.py
57
+ - PLBERT
58
+ - step_1000000.t7
59
+ - config.yml
60
+ - util.py
61
+
62
+ ### Additional Files:
63
+ - text_utils.py: Text preprocessing utilities
64
+ - models.py: Model architecture definitions
65
+ - utils.py: Utility functions
66
+ - config.yml: Model configuration
67
+ - config.json: Detailed configuration and training metrics
68
+
69
+ ## Training Metrics
70
+ Training metrics visualization is available in training_metrics.png
71
+
72
+ ## Directory Structure
73
+ ├── Utils/
74
+ │ ├── ASR/
75
+ │ ├── JDC/
76
+ │ └── PLBERT/
77
+ ├── model_components/
78
+ └── configs/
79
+
80
+ ## Usage Instructions
81
+ 1. Load the model using the provided config.yml
82
+ 2. Ensure all utility components (ASR, JDC, PLBERT) are in their respective directories
83
+ 3. Use text_utils.py for text preprocessing
84
+ 4. Follow the inference example in the StyleTTS2 documentation
Utils/ASR/config.yml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_dir: "logs/20201006"
2
+ save_freq: 5
3
+ device: "cuda"
4
+ epochs: 180
5
+ batch_size: 64
6
+ pretrained_model: ""
7
+ train_data: "ASRDataset/train_list.txt"
8
+ val_data: "ASRDataset/val_list.txt"
9
+
10
+ dataset_params:
11
+ data_augmentation: false
12
+
13
+ preprocess_parasm:
14
+ sr: 24000
15
+ spect_params:
16
+ n_fft: 2048
17
+ win_length: 1200
18
+ hop_length: 300
19
+ mel_params:
20
+ n_mels: 80
21
+
22
+ model_params:
23
+ input_dim: 80
24
+ hidden_dim: 256
25
+ n_token: 178
26
+ token_embedding_dim: 512
27
+
28
+ optimizer_params:
29
+ lr: 0.0005
Utils/ASR/epoch_00080.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fedd55a1234b0c56e1e8b509c74edf3a5e2f27106a66038a4a946047a775bd6c
3
+ size 94552811
Utils/ASR/layers.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from typing import Optional, Any
5
+ from torch import Tensor
6
+ import torch.nn.functional as F
7
+ import torchaudio
8
+ import torchaudio.functional as audio_F
9
+
10
+ import random
11
+ random.seed(0)
12
+
13
+
14
+ def _get_activation_fn(activ):
15
+ if activ == 'relu':
16
+ return nn.ReLU()
17
+ elif activ == 'lrelu':
18
+ return nn.LeakyReLU(0.2)
19
+ elif activ == 'swish':
20
+ return lambda x: x*torch.sigmoid(x)
21
+ else:
22
+ raise RuntimeError('Unexpected activ type %s, expected [relu, lrelu, swish]' % activ)
23
+
24
+ class LinearNorm(torch.nn.Module):
25
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
26
+ super(LinearNorm, self).__init__()
27
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
28
+
29
+ torch.nn.init.xavier_uniform_(
30
+ self.linear_layer.weight,
31
+ gain=torch.nn.init.calculate_gain(w_init_gain))
32
+
33
+ def forward(self, x):
34
+ return self.linear_layer(x)
35
+
36
+
37
+ class ConvNorm(torch.nn.Module):
38
+ def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
39
+ padding=None, dilation=1, bias=True, w_init_gain='linear', param=None):
40
+ super(ConvNorm, self).__init__()
41
+ if padding is None:
42
+ assert(kernel_size % 2 == 1)
43
+ padding = int(dilation * (kernel_size - 1) / 2)
44
+
45
+ self.conv = torch.nn.Conv1d(in_channels, out_channels,
46
+ kernel_size=kernel_size, stride=stride,
47
+ padding=padding, dilation=dilation,
48
+ bias=bias)
49
+
50
+ torch.nn.init.xavier_uniform_(
51
+ self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain, param=param))
52
+
53
+ def forward(self, signal):
54
+ conv_signal = self.conv(signal)
55
+ return conv_signal
56
+
57
+ class CausualConv(nn.Module):
58
+ def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=1, dilation=1, bias=True, w_init_gain='linear', param=None):
59
+ super(CausualConv, self).__init__()
60
+ if padding is None:
61
+ assert(kernel_size % 2 == 1)
62
+ padding = int(dilation * (kernel_size - 1) / 2) * 2
63
+ else:
64
+ self.padding = padding * 2
65
+ self.conv = nn.Conv1d(in_channels, out_channels,
66
+ kernel_size=kernel_size, stride=stride,
67
+ padding=self.padding,
68
+ dilation=dilation,
69
+ bias=bias)
70
+
71
+ torch.nn.init.xavier_uniform_(
72
+ self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain, param=param))
73
+
74
+ def forward(self, x):
75
+ x = self.conv(x)
76
+ x = x[:, :, :-self.padding]
77
+ return x
78
+
79
+ class CausualBlock(nn.Module):
80
+ def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2, activ='lrelu'):
81
+ super(CausualBlock, self).__init__()
82
+ self.blocks = nn.ModuleList([
83
+ self._get_conv(hidden_dim, dilation=3**i, activ=activ, dropout_p=dropout_p)
84
+ for i in range(n_conv)])
85
+
86
+ def forward(self, x):
87
+ for block in self.blocks:
88
+ res = x
89
+ x = block(x)
90
+ x += res
91
+ return x
92
+
93
+ def _get_conv(self, hidden_dim, dilation, activ='lrelu', dropout_p=0.2):
94
+ layers = [
95
+ CausualConv(hidden_dim, hidden_dim, kernel_size=3, padding=dilation, dilation=dilation),
96
+ _get_activation_fn(activ),
97
+ nn.BatchNorm1d(hidden_dim),
98
+ nn.Dropout(p=dropout_p),
99
+ CausualConv(hidden_dim, hidden_dim, kernel_size=3, padding=1, dilation=1),
100
+ _get_activation_fn(activ),
101
+ nn.Dropout(p=dropout_p)
102
+ ]
103
+ return nn.Sequential(*layers)
104
+
105
+ class ConvBlock(nn.Module):
106
+ def __init__(self, hidden_dim, n_conv=3, dropout_p=0.2, activ='relu'):
107
+ super().__init__()
108
+ self._n_groups = 8
109
+ self.blocks = nn.ModuleList([
110
+ self._get_conv(hidden_dim, dilation=3**i, activ=activ, dropout_p=dropout_p)
111
+ for i in range(n_conv)])
112
+
113
+
114
+ def forward(self, x):
115
+ for block in self.blocks:
116
+ res = x
117
+ x = block(x)
118
+ x += res
119
+ return x
120
+
121
+ def _get_conv(self, hidden_dim, dilation, activ='relu', dropout_p=0.2):
122
+ layers = [
123
+ ConvNorm(hidden_dim, hidden_dim, kernel_size=3, padding=dilation, dilation=dilation),
124
+ _get_activation_fn(activ),
125
+ nn.GroupNorm(num_groups=self._n_groups, num_channels=hidden_dim),
126
+ nn.Dropout(p=dropout_p),
127
+ ConvNorm(hidden_dim, hidden_dim, kernel_size=3, padding=1, dilation=1),
128
+ _get_activation_fn(activ),
129
+ nn.Dropout(p=dropout_p)
130
+ ]
131
+ return nn.Sequential(*layers)
132
+
133
+ class LocationLayer(nn.Module):
134
+ def __init__(self, attention_n_filters, attention_kernel_size,
135
+ attention_dim):
136
+ super(LocationLayer, self).__init__()
137
+ padding = int((attention_kernel_size - 1) / 2)
138
+ self.location_conv = ConvNorm(2, attention_n_filters,
139
+ kernel_size=attention_kernel_size,
140
+ padding=padding, bias=False, stride=1,
141
+ dilation=1)
142
+ self.location_dense = LinearNorm(attention_n_filters, attention_dim,
143
+ bias=False, w_init_gain='tanh')
144
+
145
+ def forward(self, attention_weights_cat):
146
+ processed_attention = self.location_conv(attention_weights_cat)
147
+ processed_attention = processed_attention.transpose(1, 2)
148
+ processed_attention = self.location_dense(processed_attention)
149
+ return processed_attention
150
+
151
+
152
+ class Attention(nn.Module):
153
+ def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
154
+ attention_location_n_filters, attention_location_kernel_size):
155
+ super(Attention, self).__init__()
156
+ self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
157
+ bias=False, w_init_gain='tanh')
158
+ self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
159
+ w_init_gain='tanh')
160
+ self.v = LinearNorm(attention_dim, 1, bias=False)
161
+ self.location_layer = LocationLayer(attention_location_n_filters,
162
+ attention_location_kernel_size,
163
+ attention_dim)
164
+ self.score_mask_value = -float("inf")
165
+
166
+ def get_alignment_energies(self, query, processed_memory,
167
+ attention_weights_cat):
168
+ """
169
+ PARAMS
170
+ ------
171
+ query: decoder output (batch, n_mel_channels * n_frames_per_step)
172
+ processed_memory: processed encoder outputs (B, T_in, attention_dim)
173
+ attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
174
+ RETURNS
175
+ -------
176
+ alignment (batch, max_time)
177
+ """
178
+
179
+ processed_query = self.query_layer(query.unsqueeze(1))
180
+ processed_attention_weights = self.location_layer(attention_weights_cat)
181
+ energies = self.v(torch.tanh(
182
+ processed_query + processed_attention_weights + processed_memory))
183
+
184
+ energies = energies.squeeze(-1)
185
+ return energies
186
+
187
+ def forward(self, attention_hidden_state, memory, processed_memory,
188
+ attention_weights_cat, mask):
189
+ """
190
+ PARAMS
191
+ ------
192
+ attention_hidden_state: attention rnn last output
193
+ memory: encoder outputs
194
+ processed_memory: processed encoder outputs
195
+ attention_weights_cat: previous and cummulative attention weights
196
+ mask: binary mask for padded data
197
+ """
198
+ alignment = self.get_alignment_energies(
199
+ attention_hidden_state, processed_memory, attention_weights_cat)
200
+
201
+ if mask is not None:
202
+ alignment.data.masked_fill_(mask, self.score_mask_value)
203
+
204
+ attention_weights = F.softmax(alignment, dim=1)
205
+ attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
206
+ attention_context = attention_context.squeeze(1)
207
+
208
+ return attention_context, attention_weights
209
+
210
+
211
+ class ForwardAttentionV2(nn.Module):
212
+ def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
213
+ attention_location_n_filters, attention_location_kernel_size):
214
+ super(ForwardAttentionV2, self).__init__()
215
+ self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
216
+ bias=False, w_init_gain='tanh')
217
+ self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
218
+ w_init_gain='tanh')
219
+ self.v = LinearNorm(attention_dim, 1, bias=False)
220
+ self.location_layer = LocationLayer(attention_location_n_filters,
221
+ attention_location_kernel_size,
222
+ attention_dim)
223
+ self.score_mask_value = -float(1e20)
224
+
225
+ def get_alignment_energies(self, query, processed_memory,
226
+ attention_weights_cat):
227
+ """
228
+ PARAMS
229
+ ------
230
+ query: decoder output (batch, n_mel_channels * n_frames_per_step)
231
+ processed_memory: processed encoder outputs (B, T_in, attention_dim)
232
+ attention_weights_cat: prev. and cumulative att weights (B, 2, max_time)
233
+ RETURNS
234
+ -------
235
+ alignment (batch, max_time)
236
+ """
237
+
238
+ processed_query = self.query_layer(query.unsqueeze(1))
239
+ processed_attention_weights = self.location_layer(attention_weights_cat)
240
+ energies = self.v(torch.tanh(
241
+ processed_query + processed_attention_weights + processed_memory))
242
+
243
+ energies = energies.squeeze(-1)
244
+ return energies
245
+
246
+ def forward(self, attention_hidden_state, memory, processed_memory,
247
+ attention_weights_cat, mask, log_alpha):
248
+ """
249
+ PARAMS
250
+ ------
251
+ attention_hidden_state: attention rnn last output
252
+ memory: encoder outputs
253
+ processed_memory: processed encoder outputs
254
+ attention_weights_cat: previous and cummulative attention weights
255
+ mask: binary mask for padded data
256
+ """
257
+ log_energy = self.get_alignment_energies(
258
+ attention_hidden_state, processed_memory, attention_weights_cat)
259
+
260
+ #log_energy =
261
+
262
+ if mask is not None:
263
+ log_energy.data.masked_fill_(mask, self.score_mask_value)
264
+
265
+ #attention_weights = F.softmax(alignment, dim=1)
266
+
267
+ #content_score = log_energy.unsqueeze(1) #[B, MAX_TIME] -> [B, 1, MAX_TIME]
268
+ #log_alpha = log_alpha.unsqueeze(2) #[B, MAX_TIME] -> [B, MAX_TIME, 1]
269
+
270
+ #log_total_score = log_alpha + content_score
271
+
272
+ #previous_attention_weights = attention_weights_cat[:,0,:]
273
+
274
+ log_alpha_shift_padded = []
275
+ max_time = log_energy.size(1)
276
+ for sft in range(2):
277
+ shifted = log_alpha[:,:max_time-sft]
278
+ shift_padded = F.pad(shifted, (sft,0), 'constant', self.score_mask_value)
279
+ log_alpha_shift_padded.append(shift_padded.unsqueeze(2))
280
+
281
+ biased = torch.logsumexp(torch.cat(log_alpha_shift_padded,2), 2)
282
+
283
+ log_alpha_new = biased + log_energy
284
+
285
+ attention_weights = F.softmax(log_alpha_new, dim=1)
286
+
287
+ attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
288
+ attention_context = attention_context.squeeze(1)
289
+
290
+ return attention_context, attention_weights, log_alpha_new
291
+
292
+
293
+ class PhaseShuffle2d(nn.Module):
294
+ def __init__(self, n=2):
295
+ super(PhaseShuffle2d, self).__init__()
296
+ self.n = n
297
+ self.random = random.Random(1)
298
+
299
+ def forward(self, x, move=None):
300
+ # x.size = (B, C, M, L)
301
+ if move is None:
302
+ move = self.random.randint(-self.n, self.n)
303
+
304
+ if move == 0:
305
+ return x
306
+ else:
307
+ left = x[:, :, :, :move]
308
+ right = x[:, :, :, move:]
309
+ shuffled = torch.cat([right, left], dim=3)
310
+ return shuffled
311
+
312
+ class PhaseShuffle1d(nn.Module):
313
+ def __init__(self, n=2):
314
+ super(PhaseShuffle1d, self).__init__()
315
+ self.n = n
316
+ self.random = random.Random(1)
317
+
318
+ def forward(self, x, move=None):
319
+ # x.size = (B, C, M, L)
320
+ if move is None:
321
+ move = self.random.randint(-self.n, self.n)
322
+
323
+ if move == 0:
324
+ return x
325
+ else:
326
+ left = x[:, :, :move]
327
+ right = x[:, :, move:]
328
+ shuffled = torch.cat([right, left], dim=2)
329
+
330
+ return shuffled
331
+
332
+ class MFCC(nn.Module):
333
+ def __init__(self, n_mfcc=40, n_mels=80):
334
+ super(MFCC, self).__init__()
335
+ self.n_mfcc = n_mfcc
336
+ self.n_mels = n_mels
337
+ self.norm = 'ortho'
338
+ dct_mat = audio_F.create_dct(self.n_mfcc, self.n_mels, self.norm)
339
+ self.register_buffer('dct_mat', dct_mat)
340
+
341
+ def forward(self, mel_specgram):
342
+ if len(mel_specgram.shape) == 2:
343
+ mel_specgram = mel_specgram.unsqueeze(0)
344
+ unsqueezed = True
345
+ else:
346
+ unsqueezed = False
347
+ # (channel, n_mels, time).tranpose(...) dot (n_mels, n_mfcc)
348
+ # -> (channel, time, n_mfcc).tranpose(...)
349
+ mfcc = torch.matmul(mel_specgram.transpose(1, 2), self.dct_mat).transpose(1, 2)
350
+
351
+ # unpack batch
352
+ if unsqueezed:
353
+ mfcc = mfcc.squeeze(0)
354
+ return mfcc
Utils/ASR/models.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import TransformerEncoder
5
+ import torch.nn.functional as F
6
+ from .layers import MFCC, Attention, LinearNorm, ConvNorm, ConvBlock
7
+
8
+ class ASRCNN(nn.Module):
9
+ def __init__(self,
10
+ input_dim=80,
11
+ hidden_dim=256,
12
+ n_token=35,
13
+ n_layers=6,
14
+ token_embedding_dim=256,
15
+
16
+ ):
17
+ super().__init__()
18
+ self.n_token = n_token
19
+ self.n_down = 1
20
+ self.to_mfcc = MFCC()
21
+ self.init_cnn = ConvNorm(input_dim//2, hidden_dim, kernel_size=7, padding=3, stride=2)
22
+ self.cnns = nn.Sequential(
23
+ *[nn.Sequential(
24
+ ConvBlock(hidden_dim),
25
+ nn.GroupNorm(num_groups=1, num_channels=hidden_dim)
26
+ ) for n in range(n_layers)])
27
+ self.projection = ConvNorm(hidden_dim, hidden_dim // 2)
28
+ self.ctc_linear = nn.Sequential(
29
+ LinearNorm(hidden_dim//2, hidden_dim),
30
+ nn.ReLU(),
31
+ LinearNorm(hidden_dim, n_token))
32
+ self.asr_s2s = ASRS2S(
33
+ embedding_dim=token_embedding_dim,
34
+ hidden_dim=hidden_dim//2,
35
+ n_token=n_token)
36
+
37
+ def forward(self, x, src_key_padding_mask=None, text_input=None):
38
+ x = self.to_mfcc(x)
39
+ x = self.init_cnn(x)
40
+ x = self.cnns(x)
41
+ x = self.projection(x)
42
+ x = x.transpose(1, 2)
43
+ ctc_logit = self.ctc_linear(x)
44
+ if text_input is not None:
45
+ _, s2s_logit, s2s_attn = self.asr_s2s(x, src_key_padding_mask, text_input)
46
+ return ctc_logit, s2s_logit, s2s_attn
47
+ else:
48
+ return ctc_logit
49
+
50
+ def get_feature(self, x):
51
+ x = self.to_mfcc(x.squeeze(1))
52
+ x = self.init_cnn(x)
53
+ x = self.cnns(x)
54
+ x = self.projection(x)
55
+ return x
56
+
57
+ def length_to_mask(self, lengths):
58
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
59
+ mask = torch.gt(mask+1, lengths.unsqueeze(1)).to(lengths.device)
60
+ return mask
61
+
62
+ def get_future_mask(self, out_length, unmask_future_steps=0):
63
+ """
64
+ Args:
65
+ out_length (int): returned mask shape is (out_length, out_length).
66
+ unmask_futre_steps (int): unmasking future step size.
67
+ Return:
68
+ mask (torch.BoolTensor): mask future timesteps mask[i, j] = True if i > j + unmask_future_steps else False
69
+ """
70
+ index_tensor = torch.arange(out_length).unsqueeze(0).expand(out_length, -1)
71
+ mask = torch.gt(index_tensor, index_tensor.T + unmask_future_steps)
72
+ return mask
73
+
74
+ class ASRS2S(nn.Module):
75
+ def __init__(self,
76
+ embedding_dim=256,
77
+ hidden_dim=512,
78
+ n_location_filters=32,
79
+ location_kernel_size=63,
80
+ n_token=40):
81
+ super(ASRS2S, self).__init__()
82
+ self.embedding = nn.Embedding(n_token, embedding_dim)
83
+ val_range = math.sqrt(6 / hidden_dim)
84
+ self.embedding.weight.data.uniform_(-val_range, val_range)
85
+
86
+ self.decoder_rnn_dim = hidden_dim
87
+ self.project_to_n_symbols = nn.Linear(self.decoder_rnn_dim, n_token)
88
+ self.attention_layer = Attention(
89
+ self.decoder_rnn_dim,
90
+ hidden_dim,
91
+ hidden_dim,
92
+ n_location_filters,
93
+ location_kernel_size
94
+ )
95
+ self.decoder_rnn = nn.LSTMCell(self.decoder_rnn_dim + embedding_dim, self.decoder_rnn_dim)
96
+ self.project_to_hidden = nn.Sequential(
97
+ LinearNorm(self.decoder_rnn_dim * 2, hidden_dim),
98
+ nn.Tanh())
99
+ self.sos = 1
100
+ self.eos = 2
101
+
102
+ def initialize_decoder_states(self, memory, mask):
103
+ """
104
+ moemory.shape = (B, L, H) = (Batchsize, Maxtimestep, Hiddendim)
105
+ """
106
+ B, L, H = memory.shape
107
+ self.decoder_hidden = torch.zeros((B, self.decoder_rnn_dim)).type_as(memory)
108
+ self.decoder_cell = torch.zeros((B, self.decoder_rnn_dim)).type_as(memory)
109
+ self.attention_weights = torch.zeros((B, L)).type_as(memory)
110
+ self.attention_weights_cum = torch.zeros((B, L)).type_as(memory)
111
+ self.attention_context = torch.zeros((B, H)).type_as(memory)
112
+ self.memory = memory
113
+ self.processed_memory = self.attention_layer.memory_layer(memory)
114
+ self.mask = mask
115
+ self.unk_index = 3
116
+ self.random_mask = 0.1
117
+
118
+ def forward(self, memory, memory_mask, text_input):
119
+ """
120
+ moemory.shape = (B, L, H) = (Batchsize, Maxtimestep, Hiddendim)
121
+ moemory_mask.shape = (B, L, )
122
+ texts_input.shape = (B, T)
123
+ """
124
+ self.initialize_decoder_states(memory, memory_mask)
125
+ # text random mask
126
+ random_mask = (torch.rand(text_input.shape) < self.random_mask).to(text_input.device)
127
+ _text_input = text_input.clone()
128
+ _text_input.masked_fill_(random_mask, self.unk_index)
129
+ decoder_inputs = self.embedding(_text_input).transpose(0, 1) # -> [T, B, channel]
130
+ start_embedding = self.embedding(
131
+ torch.LongTensor([self.sos]*decoder_inputs.size(1)).to(decoder_inputs.device))
132
+ decoder_inputs = torch.cat((start_embedding.unsqueeze(0), decoder_inputs), dim=0)
133
+
134
+ hidden_outputs, logit_outputs, alignments = [], [], []
135
+ while len(hidden_outputs) < decoder_inputs.size(0):
136
+
137
+ decoder_input = decoder_inputs[len(hidden_outputs)]
138
+ hidden, logit, attention_weights = self.decode(decoder_input)
139
+ hidden_outputs += [hidden]
140
+ logit_outputs += [logit]
141
+ alignments += [attention_weights]
142
+
143
+ hidden_outputs, logit_outputs, alignments = \
144
+ self.parse_decoder_outputs(
145
+ hidden_outputs, logit_outputs, alignments)
146
+
147
+ return hidden_outputs, logit_outputs, alignments
148
+
149
+
150
+ def decode(self, decoder_input):
151
+
152
+ cell_input = torch.cat((decoder_input, self.attention_context), -1)
153
+ self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
154
+ cell_input,
155
+ (self.decoder_hidden, self.decoder_cell))
156
+
157
+ attention_weights_cat = torch.cat(
158
+ (self.attention_weights.unsqueeze(1),
159
+ self.attention_weights_cum.unsqueeze(1)),dim=1)
160
+
161
+ self.attention_context, self.attention_weights = self.attention_layer(
162
+ self.decoder_hidden,
163
+ self.memory,
164
+ self.processed_memory,
165
+ attention_weights_cat,
166
+ self.mask)
167
+
168
+ self.attention_weights_cum += self.attention_weights
169
+
170
+ hidden_and_context = torch.cat((self.decoder_hidden, self.attention_context), -1)
171
+ hidden = self.project_to_hidden(hidden_and_context)
172
+
173
+ # dropout to increasing g
174
+ logit = self.project_to_n_symbols(F.dropout(hidden, 0.5, self.training))
175
+
176
+ return hidden, logit, self.attention_weights
177
+
178
+ def parse_decoder_outputs(self, hidden, logit, alignments):
179
+
180
+ # -> [B, T_out + 1, max_time]
181
+ alignments = torch.stack(alignments).transpose(0,1)
182
+ # [T_out + 1, B, n_symbols] -> [B, T_out + 1, n_symbols]
183
+ logit = torch.stack(logit).transpose(0, 1).contiguous()
184
+ hidden = torch.stack(hidden).transpose(0, 1).contiguous()
185
+
186
+ return hidden, logit, alignments
Utils/JDC/bst.t7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54dc94364b97e18ac1dfa6287714ed121248cfaac4cfd39d061c6e0a089ef169
3
+ size 21029926
Utils/JDC/model.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Implementation of model from:
3
+ Kum et al. - "Joint Detection and Classification of Singing Voice Melody Using
4
+ Convolutional Recurrent Neural Networks" (2019)
5
+ Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d
6
+ """
7
+ import torch
8
+ from torch import nn
9
+
10
+ class JDCNet(nn.Module):
11
+ """
12
+ Joint Detection and Classification Network model for singing voice melody.
13
+ """
14
+ def __init__(self, num_class=722, seq_len=31, leaky_relu_slope=0.01):
15
+ super().__init__()
16
+ self.num_class = num_class
17
+
18
+ # input = (b, 1, 31, 513), b = batch size
19
+ self.conv_block = nn.Sequential(
20
+ nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False), # out: (b, 64, 31, 513)
21
+ nn.BatchNorm2d(num_features=64),
22
+ nn.LeakyReLU(leaky_relu_slope, inplace=True),
23
+ nn.Conv2d(64, 64, 3, padding=1, bias=False), # (b, 64, 31, 513)
24
+ )
25
+
26
+ # res blocks
27
+ self.res_block1 = ResBlock(in_channels=64, out_channels=128) # (b, 128, 31, 128)
28
+ self.res_block2 = ResBlock(in_channels=128, out_channels=192) # (b, 192, 31, 32)
29
+ self.res_block3 = ResBlock(in_channels=192, out_channels=256) # (b, 256, 31, 8)
30
+
31
+ # pool block
32
+ self.pool_block = nn.Sequential(
33
+ nn.BatchNorm2d(num_features=256),
34
+ nn.LeakyReLU(leaky_relu_slope, inplace=True),
35
+ nn.MaxPool2d(kernel_size=(1, 4)), # (b, 256, 31, 2)
36
+ nn.Dropout(p=0.2),
37
+ )
38
+
39
+ # maxpool layers (for auxiliary network inputs)
40
+ # in = (b, 128, 31, 513) from conv_block, out = (b, 128, 31, 2)
41
+ self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40))
42
+ # in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2)
43
+ self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20))
44
+ # in = (b, 128, 31, 32) from res_block2, out = (b, 128, 31, 2)
45
+ self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10))
46
+
47
+ # in = (b, 640, 31, 2), out = (b, 256, 31, 2)
48
+ self.detector_conv = nn.Sequential(
49
+ nn.Conv2d(640, 256, 1, bias=False),
50
+ nn.BatchNorm2d(256),
51
+ nn.LeakyReLU(leaky_relu_slope, inplace=True),
52
+ nn.Dropout(p=0.2),
53
+ )
54
+
55
+ # input: (b, 31, 512) - resized from (b, 256, 31, 2)
56
+ self.bilstm_classifier = nn.LSTM(
57
+ input_size=512, hidden_size=256,
58
+ batch_first=True, bidirectional=True) # (b, 31, 512)
59
+
60
+ # input: (b, 31, 512) - resized from (b, 256, 31, 2)
61
+ self.bilstm_detector = nn.LSTM(
62
+ input_size=512, hidden_size=256,
63
+ batch_first=True, bidirectional=True) # (b, 31, 512)
64
+
65
+ # input: (b * 31, 512)
66
+ self.classifier = nn.Linear(in_features=512, out_features=self.num_class) # (b * 31, num_class)
67
+
68
+ # input: (b * 31, 512)
69
+ self.detector = nn.Linear(in_features=512, out_features=2) # (b * 31, 2) - binary classifier
70
+
71
+ # initialize weights
72
+ self.apply(self.init_weights)
73
+
74
+ def get_feature_GAN(self, x):
75
+ seq_len = x.shape[-2]
76
+ x = x.float().transpose(-1, -2)
77
+
78
+ convblock_out = self.conv_block(x)
79
+
80
+ resblock1_out = self.res_block1(convblock_out)
81
+ resblock2_out = self.res_block2(resblock1_out)
82
+ resblock3_out = self.res_block3(resblock2_out)
83
+ poolblock_out = self.pool_block[0](resblock3_out)
84
+ poolblock_out = self.pool_block[1](poolblock_out)
85
+
86
+ return poolblock_out.transpose(-1, -2)
87
+
88
+ def get_feature(self, x):
89
+ seq_len = x.shape[-2]
90
+ x = x.float().transpose(-1, -2)
91
+
92
+ convblock_out = self.conv_block(x)
93
+
94
+ resblock1_out = self.res_block1(convblock_out)
95
+ resblock2_out = self.res_block2(resblock1_out)
96
+ resblock3_out = self.res_block3(resblock2_out)
97
+ poolblock_out = self.pool_block[0](resblock3_out)
98
+ poolblock_out = self.pool_block[1](poolblock_out)
99
+
100
+ return self.pool_block[2](poolblock_out)
101
+
102
+ def forward(self, x):
103
+ """
104
+ Returns:
105
+ classification_prediction, detection_prediction
106
+ sizes: (b, 31, 722), (b, 31, 2)
107
+ """
108
+ ###############################
109
+ # forward pass for classifier #
110
+ ###############################
111
+ seq_len = x.shape[-1]
112
+ x = x.float().transpose(-1, -2)
113
+
114
+ convblock_out = self.conv_block(x)
115
+
116
+ resblock1_out = self.res_block1(convblock_out)
117
+ resblock2_out = self.res_block2(resblock1_out)
118
+ resblock3_out = self.res_block3(resblock2_out)
119
+
120
+
121
+ poolblock_out = self.pool_block[0](resblock3_out)
122
+ poolblock_out = self.pool_block[1](poolblock_out)
123
+ GAN_feature = poolblock_out.transpose(-1, -2)
124
+ poolblock_out = self.pool_block[2](poolblock_out)
125
+
126
+ # (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512)
127
+ classifier_out = poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512))
128
+ classifier_out, _ = self.bilstm_classifier(classifier_out) # ignore the hidden states
129
+
130
+ classifier_out = classifier_out.contiguous().view((-1, 512)) # (b * 31, 512)
131
+ classifier_out = self.classifier(classifier_out)
132
+ classifier_out = classifier_out.view((-1, seq_len, self.num_class)) # (b, 31, num_class)
133
+
134
+ # sizes: (b, 31, 722), (b, 31, 2)
135
+ # classifier output consists of predicted pitch classes per frame
136
+ # detector output consists of: (isvoice, notvoice) estimates per frame
137
+ return torch.abs(classifier_out.squeeze()), GAN_feature, poolblock_out
138
+
139
+ @staticmethod
140
+ def init_weights(m):
141
+ if isinstance(m, nn.Linear):
142
+ nn.init.kaiming_uniform_(m.weight)
143
+ if m.bias is not None:
144
+ nn.init.constant_(m.bias, 0)
145
+ elif isinstance(m, nn.Conv2d):
146
+ nn.init.xavier_normal_(m.weight)
147
+ elif isinstance(m, nn.LSTM) or isinstance(m, nn.LSTMCell):
148
+ for p in m.parameters():
149
+ if p.data is None:
150
+ continue
151
+
152
+ if len(p.shape) >= 2:
153
+ nn.init.orthogonal_(p.data)
154
+ else:
155
+ nn.init.normal_(p.data)
156
+
157
+
158
+ class ResBlock(nn.Module):
159
+ def __init__(self, in_channels: int, out_channels: int, leaky_relu_slope=0.01):
160
+ super().__init__()
161
+ self.downsample = in_channels != out_channels
162
+
163
+ # BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper
164
+ self.pre_conv = nn.Sequential(
165
+ nn.BatchNorm2d(num_features=in_channels),
166
+ nn.LeakyReLU(leaky_relu_slope, inplace=True),
167
+ nn.MaxPool2d(kernel_size=(1, 2)), # apply downsampling on the y axis only
168
+ )
169
+
170
+ # conv layers
171
+ self.conv = nn.Sequential(
172
+ nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
173
+ kernel_size=3, padding=1, bias=False),
174
+ nn.BatchNorm2d(out_channels),
175
+ nn.LeakyReLU(leaky_relu_slope, inplace=True),
176
+ nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
177
+ )
178
+
179
+ # 1 x 1 convolution layer to match the feature dimensions
180
+ self.conv1by1 = None
181
+ if self.downsample:
182
+ self.conv1by1 = nn.Conv2d(in_channels, out_channels, 1, bias=False)
183
+
184
+ def forward(self, x):
185
+ x = self.pre_conv(x)
186
+ if self.downsample:
187
+ x = self.conv(x) + self.conv1by1(x)
188
+ else:
189
+ x = self.conv(x) + x
190
+ return x
Utils/PLBERT/config.yml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_dir: "Checkpoint"
2
+ mixed_precision: "fp16"
3
+ data_folder: "wikipedia_20220301.en.processed"
4
+ batch_size: 192
5
+ save_interval: 5000
6
+ log_interval: 10
7
+ num_process: 1 # number of GPUs
8
+ num_steps: 1000000
9
+
10
+ dataset_params:
11
+ tokenizer: "transfo-xl-wt103"
12
+ token_separator: " " # token used for phoneme separator (space)
13
+ token_mask: "M" # token used for phoneme mask (M)
14
+ word_separator: 3039 # token used for word separator (<formula>)
15
+ token_maps: "token_maps.pkl" # token map path
16
+
17
+ max_mel_length: 512 # max phoneme length
18
+
19
+ word_mask_prob: 0.15 # probability to mask the entire word
20
+ phoneme_mask_prob: 0.1 # probability to mask each phoneme
21
+ replace_prob: 0.2 # probablity to replace phonemes
22
+
23
+ model_params:
24
+ vocab_size: 178
25
+ hidden_size: 768
26
+ num_attention_heads: 12
27
+ intermediate_size: 2048
28
+ max_position_embeddings: 512
29
+ num_hidden_layers: 12
30
+ dropout: 0.1
Utils/PLBERT/step_1000000.t7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0714ff85804db43e06b3b0ac5749bf90cf206257c6c5916e8a98c5933b4c21e0
3
+ size 25185187
Utils/PLBERT/util.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import torch
4
+ from transformers import AlbertConfig, AlbertModel
5
+
6
+ class CustomAlbert(AlbertModel):
7
+ def forward(self, *args, **kwargs):
8
+ # Call the original forward method
9
+ outputs = super().forward(*args, **kwargs)
10
+
11
+ # Only return the last_hidden_state
12
+ return outputs.last_hidden_state
13
+
14
+
15
+ def load_plbert(log_dir):
16
+ config_path = os.path.join(log_dir, "config.yml")
17
+ plbert_config = yaml.safe_load(open(config_path))
18
+
19
+ albert_base_configuration = AlbertConfig(**plbert_config['model_params'])
20
+ bert = CustomAlbert(albert_base_configuration)
21
+
22
+ files = os.listdir(log_dir)
23
+ ckpts = []
24
+ for f in os.listdir(log_dir):
25
+ if f.startswith("step_"): ckpts.append(f)
26
+
27
+ iters = [int(f.split('_')[-1].split('.')[0]) for f in ckpts if os.path.isfile(os.path.join(log_dir, f))]
28
+ iters = sorted(iters)[-1]
29
+
30
+ checkpoint = torch.load(log_dir + "/step_" + str(iters) + ".t7", map_location='cpu')
31
+ state_dict = checkpoint['net']
32
+ from collections import OrderedDict
33
+ new_state_dict = OrderedDict()
34
+ for k, v in state_dict.items():
35
+ name = k[7:] # remove `module.`
36
+ if name.startswith('encoder.'):
37
+ name = name[8:] # remove `encoder.`
38
+ new_state_dict[name] = v
39
+ del new_state_dict["embeddings.position_ids"]
40
+ bert.load_state_dict(new_state_dict, strict=False)
41
+
42
+ return bert
bert.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f0264bc954462e498ae0affe1c4161b36e8029bbc76c869087c9b40917b95f8
3
+ size 25178740
bert_encoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:987897733e7e7b710f6833233801b8545f9677397725acc79ee5cae34debf7cd
3
+ size 1576502
checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8abbfa93106ef4950644e1de293e7fae3383911059f2ec110c641843b74bb420
3
+ size 2242747934
config.json ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_params": {
3
+ "decoder": {
4
+ "resblock_dilation_sizes": [
5
+ [
6
+ 1,
7
+ 3,
8
+ 5
9
+ ],
10
+ [
11
+ 1,
12
+ 3,
13
+ 5
14
+ ],
15
+ [
16
+ 1,
17
+ 3,
18
+ 5
19
+ ]
20
+ ],
21
+ "resblock_kernel_sizes": [
22
+ 3,
23
+ 7,
24
+ 11
25
+ ],
26
+ "type": "hifigan",
27
+ "upsample_initial_channel": 512,
28
+ "upsample_kernel_sizes": [
29
+ 20,
30
+ 10,
31
+ 6,
32
+ 4
33
+ ],
34
+ "upsample_rates": [
35
+ 10,
36
+ 5,
37
+ 3,
38
+ 2
39
+ ]
40
+ },
41
+ "diffusion": {
42
+ "dist": {
43
+ "estimate_sigma_data": true,
44
+ "mean": -3.0,
45
+ "sigma_data": 0.2,
46
+ "std": 1.0
47
+ },
48
+ "embedding_mask_proba": 0.1,
49
+ "transformer": {
50
+ "head_features": 64,
51
+ "multiplier": 2,
52
+ "num_heads": 8,
53
+ "num_layers": 3
54
+ }
55
+ },
56
+ "dim_in": 64,
57
+ "dropout": 0.2,
58
+ "hidden_dim": 512,
59
+ "max_conv_dim": 512,
60
+ "max_dur": 50,
61
+ "multispeaker": true,
62
+ "n_layer": 3,
63
+ "n_mels": 80,
64
+ "n_token": 178,
65
+ "slm": {
66
+ "hidden": 768,
67
+ "initial_channel": 64,
68
+ "model": "microsoft/wavlm-base-plus",
69
+ "nlayers": 13,
70
+ "sr": 16000
71
+ },
72
+ "style_dim": 128
73
+ },
74
+ "training_config": {
75
+ "epochs": 10,
76
+ "batch_size": 2,
77
+ "max_len": 630,
78
+ "optimizer": {
79
+ "bert_lr": 1e-05,
80
+ "ft_lr": 0.0001,
81
+ "lr": 0.0001
82
+ },
83
+ "loss_params": {
84
+ "diff_epoch": 2,
85
+ "joint_epoch": 110,
86
+ "lambda_F0": 1.0,
87
+ "lambda_ce": 20.0,
88
+ "lambda_diff": 1.0,
89
+ "lambda_dur": 1.0,
90
+ "lambda_gen": 1.0,
91
+ "lambda_mel": 5.0,
92
+ "lambda_mono": 1.0,
93
+ "lambda_norm": 1.0,
94
+ "lambda_s2s": 1.0,
95
+ "lambda_slm": 1.0,
96
+ "lambda_sty": 1.0
97
+ }
98
+ },
99
+ "preprocess_params": {
100
+ "spect_params": {
101
+ "hop_length": 300,
102
+ "n_fft": 2048,
103
+ "win_length": 1200
104
+ },
105
+ "sr": 24000
106
+ },
107
+ "data_params": {
108
+ "OOD_data": "Data/OOD_texts.txt",
109
+ "min_length": 50,
110
+ "root_path": "Data/wavs",
111
+ "train_data": "Data/train_list.txt",
112
+ "val_data": "Data/val_list.txt"
113
+ },
114
+ "model_state": {
115
+ "epoch": 9,
116
+ "iterations": 4080,
117
+ "val_loss": 0.40760722756385803
118
+ },
119
+ "training_metrics": {
120
+ "train_loss": [],
121
+ "val_loss": [
122
+ 18.0,
123
+ 30.0,
124
+ 30.0,
125
+ 44.0,
126
+ 33.0,
127
+ 21.0,
128
+ 17.0,
129
+ 44.0,
130
+ 45.0,
131
+ 55.0,
132
+ 29.0,
133
+ 8.0,
134
+ 45.0,
135
+ 29.0,
136
+ 11.0,
137
+ 52.0,
138
+ 48.0
139
+ ],
140
+ "dur_loss": [
141
+ 0.44,
142
+ 0.398,
143
+ 0.368,
144
+ 0.373,
145
+ 0.362,
146
+ 0.371,
147
+ 0.376,
148
+ 0.438,
149
+ 0.425,
150
+ 0.418,
151
+ 0.41,
152
+ 0.4,
153
+ 0.401,
154
+ 0.399,
155
+ 0.409,
156
+ 0.397,
157
+ 0.408
158
+ ],
159
+ "F0_loss": [
160
+ 2.865,
161
+ 2.754,
162
+ 2.769,
163
+ 2.916,
164
+ 2.745,
165
+ 2.811,
166
+ 2.793,
167
+ 2.665,
168
+ 2.761,
169
+ 2.683,
170
+ 2.783,
171
+ 2.524,
172
+ 2.615,
173
+ 2.515,
174
+ 2.663,
175
+ 2.66,
176
+ 2.477
177
+ ],
178
+ "epochs": [
179
+ 1,
180
+ 2,
181
+ 3,
182
+ 4,
183
+ 5,
184
+ 6,
185
+ 7,
186
+ 8,
187
+ 9,
188
+ 10,
189
+ 11,
190
+ 12,
191
+ 13,
192
+ 14,
193
+ 15,
194
+ 16,
195
+ 17
196
+ ]
197
+ }
198
+ }
config.yml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ASR_config: Utils/ASR/config.yml
2
+ ASR_path: Utils/ASR/epoch_00080.pth
3
+ F0_path: Utils/JDC/bst.t7
4
+ PLBERT_dir: Utils/PLBERT/
5
+ model_params:
6
+ decoder:
7
+ resblock_dilation_sizes:
8
+ - - 1
9
+ - 3
10
+ - 5
11
+ - - 1
12
+ - 3
13
+ - 5
14
+ - - 1
15
+ - 3
16
+ - 5
17
+ resblock_kernel_sizes:
18
+ - 3
19
+ - 7
20
+ - 11
21
+ type: hifigan
22
+ upsample_initial_channel: 512
23
+ upsample_kernel_sizes:
24
+ - 20
25
+ - 10
26
+ - 6
27
+ - 4
28
+ upsample_rates:
29
+ - 10
30
+ - 5
31
+ - 3
32
+ - 2
33
+ diffusion:
34
+ dist:
35
+ estimate_sigma_data: true
36
+ mean: -3.0
37
+ sigma_data: 0.2
38
+ std: 1.0
39
+ embedding_mask_proba: 0.1
40
+ transformer:
41
+ head_features: 64
42
+ multiplier: 2
43
+ num_heads: 8
44
+ num_layers: 3
45
+ dim_in: 64
46
+ dropout: 0.2
47
+ hidden_dim: 512
48
+ max_conv_dim: 512
49
+ max_dur: 50
50
+ multispeaker: true
51
+ n_layer: 3
52
+ n_mels: 80
53
+ n_token: 178
54
+ slm:
55
+ hidden: 768
56
+ initial_channel: 64
57
+ model: microsoft/wavlm-base-plus
58
+ nlayers: 13
59
+ sr: 16000
60
+ style_dim: 128
61
+ preprocess_params:
62
+ spect_params:
63
+ hop_length: 300
64
+ n_fft: 2048
65
+ win_length: 1200
66
+ sr: 24000
decoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93e8cad16f38b082ebd13c2f75c41ee8165fd4533241ca8f5df61368c0a3587c
3
+ size 217409318
diffusion.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e51e3b27078fa10c2fc80c502567edaf2d8355aa287683b4d4915d7a2c4f3913
3
+ size 101337326
models.py ADDED
@@ -0,0 +1,713 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #coding:utf-8
2
+
3
+ import os
4
+ import os.path as osp
5
+
6
+ import copy
7
+ import math
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
14
+
15
+ from Utils.ASR.models import ASRCNN
16
+ from Utils.JDC.model import JDCNet
17
+
18
+ from Modules.diffusion.sampler import KDiffusion, LogNormalDistribution
19
+ from Modules.diffusion.modules import Transformer1d, StyleTransformer1d
20
+ from Modules.diffusion.diffusion import AudioDiffusionConditional
21
+
22
+ from Modules.discriminators import MultiPeriodDiscriminator, MultiResSpecDiscriminator, WavLMDiscriminator
23
+
24
+ from munch import Munch
25
+ import yaml
26
+
27
+ class LearnedDownSample(nn.Module):
28
+ def __init__(self, layer_type, dim_in):
29
+ super().__init__()
30
+ self.layer_type = layer_type
31
+
32
+ if self.layer_type == 'none':
33
+ self.conv = nn.Identity()
34
+ elif self.layer_type == 'timepreserve':
35
+ self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)))
36
+ elif self.layer_type == 'half':
37
+ self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
38
+ else:
39
+ raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
40
+
41
+ def forward(self, x):
42
+ return self.conv(x)
43
+
44
+ class LearnedUpSample(nn.Module):
45
+ def __init__(self, layer_type, dim_in):
46
+ super().__init__()
47
+ self.layer_type = layer_type
48
+
49
+ if self.layer_type == 'none':
50
+ self.conv = nn.Identity()
51
+ elif self.layer_type == 'timepreserve':
52
+ self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
53
+ elif self.layer_type == 'half':
54
+ self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
55
+ else:
56
+ raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
57
+
58
+
59
+ def forward(self, x):
60
+ return self.conv(x)
61
+
62
+ class DownSample(nn.Module):
63
+ def __init__(self, layer_type):
64
+ super().__init__()
65
+ self.layer_type = layer_type
66
+
67
+ def forward(self, x):
68
+ if self.layer_type == 'none':
69
+ return x
70
+ elif self.layer_type == 'timepreserve':
71
+ return F.avg_pool2d(x, (2, 1))
72
+ elif self.layer_type == 'half':
73
+ if x.shape[-1] % 2 != 0:
74
+ x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
75
+ return F.avg_pool2d(x, 2)
76
+ else:
77
+ raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
78
+
79
+
80
+ class UpSample(nn.Module):
81
+ def __init__(self, layer_type):
82
+ super().__init__()
83
+ self.layer_type = layer_type
84
+
85
+ def forward(self, x):
86
+ if self.layer_type == 'none':
87
+ return x
88
+ elif self.layer_type == 'timepreserve':
89
+ return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
90
+ elif self.layer_type == 'half':
91
+ return F.interpolate(x, scale_factor=2, mode='nearest')
92
+ else:
93
+ raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
94
+
95
+
96
+ class ResBlk(nn.Module):
97
+ def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
98
+ normalize=False, downsample='none'):
99
+ super().__init__()
100
+ self.actv = actv
101
+ self.normalize = normalize
102
+ self.downsample = DownSample(downsample)
103
+ self.downsample_res = LearnedDownSample(downsample, dim_in)
104
+ self.learned_sc = dim_in != dim_out
105
+ self._build_weights(dim_in, dim_out)
106
+
107
+ def _build_weights(self, dim_in, dim_out):
108
+ self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
109
+ self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
110
+ if self.normalize:
111
+ self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
112
+ self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
113
+ if self.learned_sc:
114
+ self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))
115
+
116
+ def _shortcut(self, x):
117
+ if self.learned_sc:
118
+ x = self.conv1x1(x)
119
+ if self.downsample:
120
+ x = self.downsample(x)
121
+ return x
122
+
123
+ def _residual(self, x):
124
+ if self.normalize:
125
+ x = self.norm1(x)
126
+ x = self.actv(x)
127
+ x = self.conv1(x)
128
+ x = self.downsample_res(x)
129
+ if self.normalize:
130
+ x = self.norm2(x)
131
+ x = self.actv(x)
132
+ x = self.conv2(x)
133
+ return x
134
+
135
+ def forward(self, x):
136
+ x = self._shortcut(x) + self._residual(x)
137
+ return x / math.sqrt(2) # unit variance
138
+
139
+ class StyleEncoder(nn.Module):
140
+ def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
141
+ super().__init__()
142
+ blocks = []
143
+ blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
144
+
145
+ repeat_num = 4
146
+ for _ in range(repeat_num):
147
+ dim_out = min(dim_in*2, max_conv_dim)
148
+ blocks += [ResBlk(dim_in, dim_out, downsample='half')]
149
+ dim_in = dim_out
150
+
151
+ blocks += [nn.LeakyReLU(0.2)]
152
+ blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
153
+ blocks += [nn.AdaptiveAvgPool2d(1)]
154
+ blocks += [nn.LeakyReLU(0.2)]
155
+ self.shared = nn.Sequential(*blocks)
156
+
157
+ self.unshared = nn.Linear(dim_out, style_dim)
158
+
159
+ def forward(self, x):
160
+ h = self.shared(x)
161
+ h = h.view(h.size(0), -1)
162
+ s = self.unshared(h)
163
+
164
+ return s
165
+
166
+ class LinearNorm(torch.nn.Module):
167
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
168
+ super(LinearNorm, self).__init__()
169
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
170
+
171
+ torch.nn.init.xavier_uniform_(
172
+ self.linear_layer.weight,
173
+ gain=torch.nn.init.calculate_gain(w_init_gain))
174
+
175
+ def forward(self, x):
176
+ return self.linear_layer(x)
177
+
178
+ class Discriminator2d(nn.Module):
179
+ def __init__(self, dim_in=48, num_domains=1, max_conv_dim=384, repeat_num=4):
180
+ super().__init__()
181
+ blocks = []
182
+ blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
183
+
184
+ for lid in range(repeat_num):
185
+ dim_out = min(dim_in*2, max_conv_dim)
186
+ blocks += [ResBlk(dim_in, dim_out, downsample='half')]
187
+ dim_in = dim_out
188
+
189
+ blocks += [nn.LeakyReLU(0.2)]
190
+ blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
191
+ blocks += [nn.LeakyReLU(0.2)]
192
+ blocks += [nn.AdaptiveAvgPool2d(1)]
193
+ blocks += [spectral_norm(nn.Conv2d(dim_out, num_domains, 1, 1, 0))]
194
+ self.main = nn.Sequential(*blocks)
195
+
196
+ def get_feature(self, x):
197
+ features = []
198
+ for l in self.main:
199
+ x = l(x)
200
+ features.append(x)
201
+ out = features[-1]
202
+ out = out.view(out.size(0), -1) # (batch, num_domains)
203
+ return out, features
204
+
205
+ def forward(self, x):
206
+ out, features = self.get_feature(x)
207
+ out = out.squeeze() # (batch)
208
+ return out, features
209
+
210
+ class ResBlk1d(nn.Module):
211
+ def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
212
+ normalize=False, downsample='none', dropout_p=0.2):
213
+ super().__init__()
214
+ self.actv = actv
215
+ self.normalize = normalize
216
+ self.downsample_type = downsample
217
+ self.learned_sc = dim_in != dim_out
218
+ self._build_weights(dim_in, dim_out)
219
+ self.dropout_p = dropout_p
220
+
221
+ if self.downsample_type == 'none':
222
+ self.pool = nn.Identity()
223
+ else:
224
+ self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
225
+
226
+ def _build_weights(self, dim_in, dim_out):
227
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
228
+ self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
229
+ if self.normalize:
230
+ self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
231
+ self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
232
+ if self.learned_sc:
233
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
234
+
235
+ def downsample(self, x):
236
+ if self.downsample_type == 'none':
237
+ return x
238
+ else:
239
+ if x.shape[-1] % 2 != 0:
240
+ x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
241
+ return F.avg_pool1d(x, 2)
242
+
243
+ def _shortcut(self, x):
244
+ if self.learned_sc:
245
+ x = self.conv1x1(x)
246
+ x = self.downsample(x)
247
+ return x
248
+
249
+ def _residual(self, x):
250
+ if self.normalize:
251
+ x = self.norm1(x)
252
+ x = self.actv(x)
253
+ x = F.dropout(x, p=self.dropout_p, training=self.training)
254
+
255
+ x = self.conv1(x)
256
+ x = self.pool(x)
257
+ if self.normalize:
258
+ x = self.norm2(x)
259
+
260
+ x = self.actv(x)
261
+ x = F.dropout(x, p=self.dropout_p, training=self.training)
262
+
263
+ x = self.conv2(x)
264
+ return x
265
+
266
+ def forward(self, x):
267
+ x = self._shortcut(x) + self._residual(x)
268
+ return x / math.sqrt(2) # unit variance
269
+
270
+ class LayerNorm(nn.Module):
271
+ def __init__(self, channels, eps=1e-5):
272
+ super().__init__()
273
+ self.channels = channels
274
+ self.eps = eps
275
+
276
+ self.gamma = nn.Parameter(torch.ones(channels))
277
+ self.beta = nn.Parameter(torch.zeros(channels))
278
+
279
+ def forward(self, x):
280
+ x = x.transpose(1, -1)
281
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
282
+ return x.transpose(1, -1)
283
+
284
+ class TextEncoder(nn.Module):
285
+ def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
286
+ super().__init__()
287
+ self.embedding = nn.Embedding(n_symbols, channels)
288
+
289
+ padding = (kernel_size - 1) // 2
290
+ self.cnn = nn.ModuleList()
291
+ for _ in range(depth):
292
+ self.cnn.append(nn.Sequential(
293
+ weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
294
+ LayerNorm(channels),
295
+ actv,
296
+ nn.Dropout(0.2),
297
+ ))
298
+ # self.cnn = nn.Sequential(*self.cnn)
299
+
300
+ self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)
301
+
302
+ def forward(self, x, input_lengths, m):
303
+ x = self.embedding(x) # [B, T, emb]
304
+ x = x.transpose(1, 2) # [B, emb, T]
305
+ m = m.to(input_lengths.device).unsqueeze(1)
306
+ x.masked_fill_(m, 0.0)
307
+
308
+ for c in self.cnn:
309
+ x = c(x)
310
+ x.masked_fill_(m, 0.0)
311
+
312
+ x = x.transpose(1, 2) # [B, T, chn]
313
+
314
+ input_lengths = input_lengths.cpu().numpy()
315
+ x = nn.utils.rnn.pack_padded_sequence(
316
+ x, input_lengths, batch_first=True, enforce_sorted=False)
317
+
318
+ self.lstm.flatten_parameters()
319
+ x, _ = self.lstm(x)
320
+ x, _ = nn.utils.rnn.pad_packed_sequence(
321
+ x, batch_first=True)
322
+
323
+ x = x.transpose(-1, -2)
324
+ x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
325
+
326
+ x_pad[:, :, :x.shape[-1]] = x
327
+ x = x_pad.to(x.device)
328
+
329
+ x.masked_fill_(m, 0.0)
330
+
331
+ return x
332
+
333
+ def inference(self, x):
334
+ x = self.embedding(x)
335
+ x = x.transpose(1, 2)
336
+ x = self.cnn(x)
337
+ x = x.transpose(1, 2)
338
+ self.lstm.flatten_parameters()
339
+ x, _ = self.lstm(x)
340
+ return x
341
+
342
+ def length_to_mask(self, lengths):
343
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
344
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
345
+ return mask
346
+
347
+
348
+
349
+ class AdaIN1d(nn.Module):
350
+ def __init__(self, style_dim, num_features):
351
+ super().__init__()
352
+ self.norm = nn.InstanceNorm1d(num_features, affine=False)
353
+ self.fc = nn.Linear(style_dim, num_features*2)
354
+
355
+ def forward(self, x, s):
356
+ h = self.fc(s)
357
+ h = h.view(h.size(0), h.size(1), 1)
358
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
359
+ return (1 + gamma) * self.norm(x) + beta
360
+
361
+ class UpSample1d(nn.Module):
362
+ def __init__(self, layer_type):
363
+ super().__init__()
364
+ self.layer_type = layer_type
365
+
366
+ def forward(self, x):
367
+ if self.layer_type == 'none':
368
+ return x
369
+ else:
370
+ return F.interpolate(x, scale_factor=2, mode='nearest')
371
+
372
+ class AdainResBlk1d(nn.Module):
373
+ def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
374
+ upsample='none', dropout_p=0.0):
375
+ super().__init__()
376
+ self.actv = actv
377
+ self.upsample_type = upsample
378
+ self.upsample = UpSample1d(upsample)
379
+ self.learned_sc = dim_in != dim_out
380
+ self._build_weights(dim_in, dim_out, style_dim)
381
+ self.dropout = nn.Dropout(dropout_p)
382
+
383
+ if upsample == 'none':
384
+ self.pool = nn.Identity()
385
+ else:
386
+ self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
387
+
388
+
389
+ def _build_weights(self, dim_in, dim_out, style_dim):
390
+ self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
391
+ self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
392
+ self.norm1 = AdaIN1d(style_dim, dim_in)
393
+ self.norm2 = AdaIN1d(style_dim, dim_out)
394
+ if self.learned_sc:
395
+ self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
396
+
397
+ def _shortcut(self, x):
398
+ x = self.upsample(x)
399
+ if self.learned_sc:
400
+ x = self.conv1x1(x)
401
+ return x
402
+
403
+ def _residual(self, x, s):
404
+ x = self.norm1(x, s)
405
+ x = self.actv(x)
406
+ x = self.pool(x)
407
+ x = self.conv1(self.dropout(x))
408
+ x = self.norm2(x, s)
409
+ x = self.actv(x)
410
+ x = self.conv2(self.dropout(x))
411
+ return x
412
+
413
+ def forward(self, x, s):
414
+ out = self._residual(x, s)
415
+ out = (out + self._shortcut(x)) / math.sqrt(2)
416
+ return out
417
+
418
+ class AdaLayerNorm(nn.Module):
419
+ def __init__(self, style_dim, channels, eps=1e-5):
420
+ super().__init__()
421
+ self.channels = channels
422
+ self.eps = eps
423
+
424
+ self.fc = nn.Linear(style_dim, channels*2)
425
+
426
+ def forward(self, x, s):
427
+ x = x.transpose(-1, -2)
428
+ x = x.transpose(1, -1)
429
+
430
+ h = self.fc(s)
431
+ h = h.view(h.size(0), h.size(1), 1)
432
+ gamma, beta = torch.chunk(h, chunks=2, dim=1)
433
+ gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
434
+
435
+
436
+ x = F.layer_norm(x, (self.channels,), eps=self.eps)
437
+ x = (1 + gamma) * x + beta
438
+ return x.transpose(1, -1).transpose(-1, -2)
439
+
440
+ class ProsodyPredictor(nn.Module):
441
+
442
+ def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
443
+ super().__init__()
444
+
445
+ self.text_encoder = DurationEncoder(sty_dim=style_dim,
446
+ d_model=d_hid,
447
+ nlayers=nlayers,
448
+ dropout=dropout)
449
+
450
+ self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
451
+ self.duration_proj = LinearNorm(d_hid, max_dur)
452
+
453
+ self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
454
+ self.F0 = nn.ModuleList()
455
+ self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
456
+ self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
457
+ self.F0.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
458
+
459
+ self.N = nn.ModuleList()
460
+ self.N.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
461
+ self.N.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
462
+ self.N.append(AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout))
463
+
464
+ self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
465
+ self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
466
+
467
+
468
+ def forward(self, texts, style, text_lengths, alignment, m):
469
+ d = self.text_encoder(texts, style, text_lengths, m)
470
+
471
+ batch_size = d.shape[0]
472
+ text_size = d.shape[1]
473
+
474
+ # predict duration
475
+ input_lengths = text_lengths.cpu().numpy()
476
+ x = nn.utils.rnn.pack_padded_sequence(
477
+ d, input_lengths, batch_first=True, enforce_sorted=False)
478
+
479
+ m = m.to(text_lengths.device).unsqueeze(1)
480
+
481
+ self.lstm.flatten_parameters()
482
+ x, _ = self.lstm(x)
483
+ x, _ = nn.utils.rnn.pad_packed_sequence(
484
+ x, batch_first=True)
485
+
486
+ x_pad = torch.zeros([x.shape[0], m.shape[-1], x.shape[-1]])
487
+
488
+ x_pad[:, :x.shape[1], :] = x
489
+ x = x_pad.to(x.device)
490
+
491
+ duration = self.duration_proj(nn.functional.dropout(x, 0.5, training=self.training))
492
+
493
+ en = (d.transpose(-1, -2) @ alignment)
494
+
495
+ return duration.squeeze(-1), en
496
+
497
+ def F0Ntrain(self, x, s):
498
+ x, _ = self.shared(x.transpose(-1, -2))
499
+
500
+ F0 = x.transpose(-1, -2)
501
+ for block in self.F0:
502
+ F0 = block(F0, s)
503
+ F0 = self.F0_proj(F0)
504
+
505
+ N = x.transpose(-1, -2)
506
+ for block in self.N:
507
+ N = block(N, s)
508
+ N = self.N_proj(N)
509
+
510
+ return F0.squeeze(1), N.squeeze(1)
511
+
512
+ def length_to_mask(self, lengths):
513
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
514
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
515
+ return mask
516
+
517
+ class DurationEncoder(nn.Module):
518
+
519
+ def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
520
+ super().__init__()
521
+ self.lstms = nn.ModuleList()
522
+ for _ in range(nlayers):
523
+ self.lstms.append(nn.LSTM(d_model + sty_dim,
524
+ d_model // 2,
525
+ num_layers=1,
526
+ batch_first=True,
527
+ bidirectional=True,
528
+ dropout=dropout))
529
+ self.lstms.append(AdaLayerNorm(sty_dim, d_model))
530
+
531
+
532
+ self.dropout = dropout
533
+ self.d_model = d_model
534
+ self.sty_dim = sty_dim
535
+
536
+ def forward(self, x, style, text_lengths, m):
537
+ masks = m.to(text_lengths.device)
538
+
539
+ x = x.permute(2, 0, 1)
540
+ s = style.expand(x.shape[0], x.shape[1], -1)
541
+ x = torch.cat([x, s], axis=-1)
542
+ x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)
543
+
544
+ x = x.transpose(0, 1)
545
+ input_lengths = text_lengths.cpu().numpy()
546
+ x = x.transpose(-1, -2)
547
+
548
+ for block in self.lstms:
549
+ if isinstance(block, AdaLayerNorm):
550
+ x = block(x.transpose(-1, -2), style).transpose(-1, -2)
551
+ x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
552
+ x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
553
+ else:
554
+ x = x.transpose(-1, -2)
555
+ x = nn.utils.rnn.pack_padded_sequence(
556
+ x, input_lengths, batch_first=True, enforce_sorted=False)
557
+ block.flatten_parameters()
558
+ x, _ = block(x)
559
+ x, _ = nn.utils.rnn.pad_packed_sequence(
560
+ x, batch_first=True)
561
+ x = F.dropout(x, p=self.dropout, training=self.training)
562
+ x = x.transpose(-1, -2)
563
+
564
+ x_pad = torch.zeros([x.shape[0], x.shape[1], m.shape[-1]])
565
+
566
+ x_pad[:, :, :x.shape[-1]] = x
567
+ x = x_pad.to(x.device)
568
+
569
+ return x.transpose(-1, -2)
570
+
571
+ def inference(self, x, style):
572
+ x = self.embedding(x.transpose(-1, -2)) * math.sqrt(self.d_model)
573
+ style = style.expand(x.shape[0], x.shape[1], -1)
574
+ x = torch.cat([x, style], axis=-1)
575
+ src = self.pos_encoder(x)
576
+ output = self.transformer_encoder(src).transpose(0, 1)
577
+ return output
578
+
579
+ def length_to_mask(self, lengths):
580
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
581
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
582
+ return mask
583
+
584
+ def load_F0_models(path):
585
+ # load F0 model
586
+
587
+ F0_model = JDCNet(num_class=1, seq_len=192)
588
+ params = torch.load(path, map_location='cpu')['net']
589
+ F0_model.load_state_dict(params)
590
+ _ = F0_model.train()
591
+
592
+ return F0_model
593
+
594
+ def load_ASR_models(ASR_MODEL_PATH, ASR_MODEL_CONFIG):
595
+ # load ASR model
596
+ def _load_config(path):
597
+ with open(path) as f:
598
+ config = yaml.safe_load(f)
599
+ model_config = config['model_params']
600
+ return model_config
601
+
602
+ def _load_model(model_config, model_path):
603
+ model = ASRCNN(**model_config)
604
+ params = torch.load(model_path, map_location='cpu')['model']
605
+ model.load_state_dict(params)
606
+ return model
607
+
608
+ asr_model_config = _load_config(ASR_MODEL_CONFIG)
609
+ asr_model = _load_model(asr_model_config, ASR_MODEL_PATH)
610
+ _ = asr_model.train()
611
+
612
+ return asr_model
613
+
614
+ def build_model(args, text_aligner, pitch_extractor, bert):
615
+ assert args.decoder.type in ['istftnet', 'hifigan'], 'Decoder type unknown'
616
+
617
+ if args.decoder.type == "istftnet":
618
+ from Modules.istftnet import Decoder
619
+ decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
620
+ resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
621
+ upsample_rates = args.decoder.upsample_rates,
622
+ upsample_initial_channel=args.decoder.upsample_initial_channel,
623
+ resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
624
+ upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
625
+ gen_istft_n_fft=args.decoder.gen_istft_n_fft, gen_istft_hop_size=args.decoder.gen_istft_hop_size)
626
+ else:
627
+ from Modules.hifigan import Decoder
628
+ decoder = Decoder(dim_in=args.hidden_dim, style_dim=args.style_dim, dim_out=args.n_mels,
629
+ resblock_kernel_sizes = args.decoder.resblock_kernel_sizes,
630
+ upsample_rates = args.decoder.upsample_rates,
631
+ upsample_initial_channel=args.decoder.upsample_initial_channel,
632
+ resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
633
+ upsample_kernel_sizes=args.decoder.upsample_kernel_sizes)
634
+
635
+ text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
636
+
637
+ predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
638
+
639
+ style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim) # acoustic style encoder
640
+ predictor_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim) # prosodic style encoder
641
+
642
+ # define diffusion model
643
+ if args.multispeaker:
644
+ transformer = StyleTransformer1d(channels=args.style_dim*2,
645
+ context_embedding_features=bert.config.hidden_size,
646
+ context_features=args.style_dim*2,
647
+ **args.diffusion.transformer)
648
+ else:
649
+ transformer = Transformer1d(channels=args.style_dim*2,
650
+ context_embedding_features=bert.config.hidden_size,
651
+ **args.diffusion.transformer)
652
+
653
+ diffusion = AudioDiffusionConditional(
654
+ in_channels=1,
655
+ embedding_max_length=bert.config.max_position_embeddings,
656
+ embedding_features=bert.config.hidden_size,
657
+ embedding_mask_proba=args.diffusion.embedding_mask_proba, # Conditional dropout of batch elements,
658
+ channels=args.style_dim*2,
659
+ context_features=args.style_dim*2,
660
+ )
661
+
662
+ diffusion.diffusion = KDiffusion(
663
+ net=diffusion.unet,
664
+ sigma_distribution=LogNormalDistribution(mean = args.diffusion.dist.mean, std = args.diffusion.dist.std),
665
+ sigma_data=args.diffusion.dist.sigma_data, # a placeholder, will be changed dynamically when start training diffusion model
666
+ dynamic_threshold=0.0
667
+ )
668
+ diffusion.diffusion.net = transformer
669
+ diffusion.unet = transformer
670
+
671
+
672
+ nets = Munch(
673
+ bert=bert,
674
+ bert_encoder=nn.Linear(bert.config.hidden_size, args.hidden_dim),
675
+
676
+ predictor=predictor,
677
+ decoder=decoder,
678
+ text_encoder=text_encoder,
679
+
680
+ predictor_encoder=predictor_encoder,
681
+ style_encoder=style_encoder,
682
+ diffusion=diffusion,
683
+
684
+ text_aligner = text_aligner,
685
+ pitch_extractor=pitch_extractor,
686
+
687
+ mpd = MultiPeriodDiscriminator(),
688
+ msd = MultiResSpecDiscriminator(),
689
+
690
+ # slm discriminator head
691
+ wd = WavLMDiscriminator(args.slm.hidden, args.slm.nlayers, args.slm.initial_channel),
692
+ )
693
+
694
+ return nets
695
+
696
+ def load_checkpoint(model, optimizer, path, load_only_params=True, ignore_modules=[]):
697
+ state = torch.load(path, map_location='cpu')
698
+ params = state['net']
699
+ for key in model:
700
+ if key in params and key not in ignore_modules:
701
+ print('%s loaded' % key)
702
+ model[key].load_state_dict(params[key], strict=False)
703
+ _ = [model[key].eval() for key in model]
704
+
705
+ if not load_only_params:
706
+ epoch = state["epoch"]
707
+ iters = state["iters"]
708
+ optimizer.load_state_dict(state["optimizer"])
709
+ else:
710
+ epoch = 0
711
+ iters = 0
712
+
713
+ return model, optimizer, epoch, iters
mpd.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a140e610e83ccd6e9407be6b13f9ddff5121525ff3af9319713adb8d6861cc17
3
+ size 164447824
msd.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bd53c1bb0b77a649e2208095f2828d71ed57ac4b9bdbebebc93f4b2cc5550ec
3
+ size 1139020
pitch_extractor.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a15c28725403a9d5a479cbcb9f75a0cc62cd1dc32c0248d06e13adb8b7049b2
3
+ size 21028913
predictor.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37bf44866e70d0c11616661ce4a02c1a8d172f1c222cf246684ba1c4ad57f6df
3
+ size 64813639
predictor_encoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac439c809ecffef6cf8d130be1888781916358489101f9d1b0f967ad3f5337e2
3
+ size 55547155
style_encoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:478c67e53980c87515d613b9afd79055584930cf00c56266b938a6b432d16902
3
+ size 55546871
text_aligner.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dad7dc341e2e7b8095f29aa7513f01ddec5d1be410f09fbd3adb6b41b2f27b52
3
+ size 31531315
text_encoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d611064414bd8a986460aad4f93666b5f7fa98aa131de2bdbe3dd27ee430332
3
+ size 22432460
text_utils.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IPA Phonemizer: https://github.com/bootphon/phonemizer
2
+
3
+ _pad = "$"
4
+ _punctuation = ';:,.!?¡¿—…"«»“” '
5
+ _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
6
+ _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
7
+
8
+ # Export all symbols:
9
+ symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
10
+
11
+ dicts = {}
12
+ for i in range(len((symbols))):
13
+ dicts[symbols[i]] = i
14
+
15
+ class TextCleaner:
16
+ def __init__(self, dummy=None):
17
+ self.word_index_dictionary = dicts
18
+ print(len(dicts))
19
+ def __call__(self, text):
20
+ indexes = []
21
+ for char in text:
22
+ try:
23
+ indexes.append(self.word_index_dictionary[char])
24
+ except KeyError:
25
+ print(text)
26
+ return indexes
training_metrics.png ADDED
utils.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from monotonic_align import maximum_path
2
+ from monotonic_align import mask_from_lens
3
+ from monotonic_align.core import maximum_path_c
4
+ import numpy as np
5
+ import torch
6
+ import copy
7
+ from torch import nn
8
+ import torch.nn.functional as F
9
+ import torchaudio
10
+ import librosa
11
+ import matplotlib.pyplot as plt
12
+ from munch import Munch
13
+
14
+ def maximum_path(neg_cent, mask):
15
+ """ Cython optimized version.
16
+ neg_cent: [b, t_t, t_s]
17
+ mask: [b, t_t, t_s]
18
+ """
19
+ device = neg_cent.device
20
+ dtype = neg_cent.dtype
21
+ neg_cent = np.ascontiguousarray(neg_cent.data.cpu().numpy().astype(np.float32))
22
+ path = np.ascontiguousarray(np.zeros(neg_cent.shape, dtype=np.int32))
23
+
24
+ t_t_max = np.ascontiguousarray(mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32))
25
+ t_s_max = np.ascontiguousarray(mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32))
26
+ maximum_path_c(path, neg_cent, t_t_max, t_s_max)
27
+ return torch.from_numpy(path).to(device=device, dtype=dtype)
28
+
29
+ def get_data_path_list(train_path=None, val_path=None):
30
+ if train_path is None:
31
+ train_path = "Data/train_list.txt"
32
+ if val_path is None:
33
+ val_path = "Data/val_list.txt"
34
+
35
+ with open(train_path, 'r', encoding='utf-8', errors='ignore') as f:
36
+ train_list = f.readlines()
37
+ with open(val_path, 'r', encoding='utf-8', errors='ignore') as f:
38
+ val_list = f.readlines()
39
+
40
+ return train_list, val_list
41
+
42
+ def length_to_mask(lengths):
43
+ mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
44
+ mask = torch.gt(mask+1, lengths.unsqueeze(1))
45
+ return mask
46
+
47
+ # for norm consistency loss
48
+ def log_norm(x, mean=-4, std=4, dim=2):
49
+ """
50
+ normalized log mel -> mel -> norm -> log(norm)
51
+ """
52
+ x = torch.log(torch.exp(x * std + mean).norm(dim=dim))
53
+ return x
54
+
55
+ def get_image(arrs):
56
+ plt.switch_backend('agg')
57
+ fig = plt.figure()
58
+ ax = plt.gca()
59
+ ax.imshow(arrs)
60
+
61
+ return fig
62
+
63
+ def recursive_munch(d):
64
+ if isinstance(d, dict):
65
+ return Munch((k, recursive_munch(v)) for k, v in d.items())
66
+ elif isinstance(d, list):
67
+ return [recursive_munch(v) for v in d]
68
+ else:
69
+ return d
70
+
71
+ def log_print(message, logger):
72
+ logger.info(message)
73
+ print(message)
74
+
wd.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0064fbf02b28a73a1dbae037c63077bc38c661362cfd08402b301606f153dde
3
+ size 4698570