Kororinpa committed on
Commit
0338194
1 Parent(s): 3d9d64b

Upload attentions.py

Browse files
Files changed (1) hide show
  1. attentions.py +303 -0
attentions.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ import commons
9
+ import modules
10
+ from modules import LayerNorm
11
+
12
+
13
class Encoder(nn.Module):
    """Stack of self-attention and feed-forward layers with post-layer normalisation.

    Every layer runs a ``MultiHeadAttention`` (built with ``window_size``, so it
    uses relative-position embeddings) followed by an ``FFN``.  Each sub-layer
    output goes through dropout, is added back to its input and is normalised
    by a ``LayerNorm``.

    Args:
        hidden_channels: channel width of the input and of every layer output.
        filter_channels: inner channel width of each FFN.
        n_heads: number of attention heads.
        n_layers: number of stacked layers.
        kernel_size: convolution kernel size of the FFN.
        p_dropout: dropout probability shared by all sub-layers.
        window_size: relative-position window handed to the attention layers.
        **kwargs: accepted and ignored.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        # Sub-modules are created layer by layer (attention, norm, FFN, norm) so
        # the order in which random initialisers are drawn stays unchanged.
        for _ in range(self.n_layers):
            attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                window_size=window_size,
            )
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Encode ``x``, zeroing every position where ``x_mask`` is zero.

        Args:
            x: input tensor; assumed to be [b, hidden_channels, t] -- TODO confirm.
            x_mask: multiplicative mask; assumed to be [b, 1, t] -- TODO confirm.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Outer product of the mask with itself: a (query, key) pair is kept
        # only when both positions are unmasked.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2)
        for attention, attn_norm, feed_forward, ffn_norm in layers:
            residual = self.drop(attention(x, x, attn_mask))
            x = attn_norm(x + residual)

            residual = self.drop(feed_forward(x, x_mask))
            x = ffn_norm(x + residual)
        # NOTE(review): the scraped source lost its indentation; the final
        # masking is applied once after the loop, as in the reference VITS
        # implementation -- confirm against the real file.
        x = x * x_mask
        return x
48
+
49
+
50
class Decoder(nn.Module):
    """Stack of decoder layers: self-attention, encoder-decoder attention, FFN.

    Each of the three sub-layers is followed by dropout, a residual addition
    and a ``LayerNorm``.  The FFN is built with ``causal=True``.

    Args:
        hidden_channels: channel width of the input and of every layer output.
        filter_channels: inner channel width of each FFN.
        n_heads: number of attention heads.
        n_layers: number of stacked layers.
        kernel_size: convolution kernel size of the FFN.
        p_dropout: dropout probability shared by all sub-layers.
        proximal_bias: forwarded to the self-attention layers.
        proximal_init: forwarded to the self-attention layers.
        **kwargs: accepted and ignored.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        # Sub-modules are created layer by layer so the order in which random
        # initialisers are drawn stays unchanged.
        for _ in range(self.n_layers):
            self_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            cross_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
            )
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """Decode ``x`` while attending to the encoder output ``h``.

        Args:
            x: decoder input.
            x_mask: multiplicative mask for ``x``; assumed [b, 1, t_x] -- TODO confirm.
            h: encoder output.
            h_mask: multiplicative mask for ``h``; assumed [b, 1, t_h] -- TODO confirm.

        Returns:
            Tensor with the same shape as ``x``, zeroed where ``x_mask`` is zero.
        """
        target_length = x_mask.size(2)
        # Self-attention mask comes from commons.subsequent_mask, moved to the
        # device and dtype of the input.
        self_attn_mask = commons.subsequent_mask(target_length).to(device=x.device, dtype=x.dtype)
        # A (decoder, encoder) position pair is kept only when both are unmasked.
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(
            self.self_attn_layers,
            self.norm_layers_0,
            self.encdec_attn_layers,
            self.norm_layers_1,
            self.ffn_layers,
            self.norm_layers_2,
        )
        for self_attention, self_norm, cross_attention, cross_norm, feed_forward, ffn_norm in layers:
            residual = self.drop(self_attention(x, x, self_attn_mask))
            x = self_norm(x + residual)

            residual = self.drop(cross_attention(x, h, encdec_attn_mask))
            x = cross_norm(x + residual)

            residual = self.drop(feed_forward(x, x_mask))
            x = ffn_norm(x + residual)
        # NOTE(review): the scraped source lost its indentation; the final
        # masking is applied once after the loop, as in the reference VITS
        # implementation -- confirm against the real file.
        x = x * x_mask
        return x
99
+
100
+
101
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention built from 1x1 ``Conv1d`` projections.

    Optional extras, each restricted to self-attention (equal source and
    target length):

    * ``window_size``: learned relative-position embeddings added to the
      attention logits and to the attended values.
    * ``proximal_bias``: a fixed ``-log1p(|i - j|)`` bias on the logits.
    * ``block_length``: logits farther than ``block_length`` from the diagonal
      are filled with ``-1e4``.

    Args:
        channels: input channel count; must be divisible by ``n_heads``.
        out_channels: channel count produced by the output projection.
        n_heads: number of attention heads.
        p_dropout: dropout probability applied to the attention weights.
        window_size: relative-position window, or ``None`` to disable.
        heads_share: when true, one relative embedding table is shared by all heads.
        block_length: local attention radius, or ``None`` to disable.
        proximal_bias: enable the proximity bias.
        proximal_init: initialise the key projection as a copy of the query projection.
    """

    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        # Attention weights of the most recent forward pass.
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            # One embedding per relative offset in [-window_size, window_size].
            self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
            self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            # Start with identical query and key projections.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys and values).

        Args:
            x: query source, [b, channels, t_t].
            c: key/value source, [b, channels, t_s].
            attn_mask: optional mask; logits where it equals 0 are suppressed.

        Returns:
            Tensor of shape [b, out_channels, t_t].  The attention weights are
            stored on ``self.attn``.
        """
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        """Compute multi-head attention on already projected tensors.

        Args:
            query: [b, d, t_t].
            key: [b, d, t_s].
            value: [b, d, t_s].
            mask: optional mask; logits where it equals 0 are filled with -1e4.

        Returns:
            Tuple ``(output, p_attn)`` with ``output`` of shape [b, d, t_t] and
            ``p_attn`` of shape [b, n_h, t_t, t_s].
        """
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s = key.size()
        t_t = query.size(2)
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        # Scale the queries once; the result feeds both the content logits and
        # the relative-position logits.
        scaled_query = query / math.sqrt(self.k_channels)
        scores = torch.matmul(scaled_query, key.transpose(-2, -1))
        if self.window_size is not None:
            assert t_s == t_t, "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(scaled_query, key_relative_embeddings)
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            # NOTE(review): the scraped source lost its indentation; the
            # block-local masking is nested under the mask check as in the
            # reference VITS implementation -- confirm against the real file.
            if self.block_length is not None:
                assert t_s == t_t, "Local attention is only available for self-attention."
                # Band of ones within block_length of the main diagonal.
                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Select the relative embeddings needed for a sequence of ``length``.

        Args:
            relative_embeddings: table with ``2 * window_size + 1`` entries along dim 1.
            length: sequence length.

        Returns:
            Slice of the (possibly padded) table with ``2 * length - 1`` entries
            along dim 1, assuming ``commons.convert_pad_shape`` yields padding
            for dim 1 here -- TODO confirm.
        """
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # Pad along the column axis.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # Add 0's in the beginning that will skew the elements after reshape.
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
            length: an integer scalar.
        Returns:
            a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
255
+
256
+
257
class FFN(nn.Module):
    """Two-layer convolutional feed-forward block with masking.

    The input is masked, padded and convolved to ``filter_channels``, passed
    through the activation and dropout, then masked, padded and convolved to
    ``out_channels``.  The result is masked once more before being returned.

    Args:
        in_channels: channel count of the input.
        out_channels: channel count of the output.
        filter_channels: channel count between the two convolutions.
        kernel_size: kernel size of both convolutions.
        p_dropout: dropout probability applied after the activation.
        activation: ``"gelu"`` selects ``x * sigmoid(1.702 * x)``; any other
            value selects ReLU.
        causal: when true, all padding is placed on the left of the last axis;
            otherwise it is split between both sides.
    """

    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Padding strategy is fixed at construction time.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply the feed-forward block.

        Args:
            x: input tensor; assumed [b, in_channels, t] -- TODO confirm.
            x_mask: multiplicative mask broadcastable against ``x``.

        Returns:
            Output of the second convolution multiplied by ``x_mask``.
        """
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            # Sigmoid-based approximation of GELU.
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        projected = self.conv_2(self.padding(hidden * x_mask))
        return projected * x_mask

    def _causal_padding(self, x):
        """Pad ``kernel_size - 1`` entries on the left only; no-op for kernel size 1."""
        if self.kernel_size == 1:
            return x
        pad_shape = [[0, 0], [0, 0], [self.kernel_size - 1, 0]]
        return F.pad(x, commons.convert_pad_shape(pad_shape))

    def _same_padding(self, x):
        """Pad ``kernel_size - 1`` entries split across both sides; no-op for kernel size 1."""
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        pad_shape = [[0, 0], [0, 0], [left, right]]
        return F.pad(x, commons.convert_pad_shape(pad_shape))