Upload 2 files
- .gitattributes +1 -0
- ch09util.py +340 -0
- dict.p +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+dict.p filter=lfs diff=lfs merge=lfs -text
ch09util.py
ADDED
@@ -0,0 +1,340 @@
import math
from copy import deepcopy

import numpy as np
import torch
from torch import nn

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def subsequent_mask(size):
    # Boolean mask of shape (1, size, size) that is True at positions a
    # token may attend to (itself and earlier positions)
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    output = torch.from_numpy(subsequent_mask) == 0
    return output


def make_std_mask(tgt, pad):
    # Combine the padding mask with the causal mask for decoder inputs
    tgt_mask = (tgt != pad).unsqueeze(-2)
    output = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)
    return output


# define the Batch class
class Batch:
    def __init__(self, src, trg=None, pad=0):
        src = torch.from_numpy(src).to(DEVICE).long()
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            trg = torch.from_numpy(trg).to(DEVICE).long()
            self.trg = trg[:, :-1]    # decoder input (drop last token)
            self.trg_y = trg[:, 1:]   # prediction targets (drop first token)
            self.trg_mask = make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()


# An encoder-decoder transformer
class Transformer(nn.Module):
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

    def forward(self, src, tgt, src_mask, tgt_mask):
        memory = self.encode(src, src_mask)
        output = self.decode(memory, src_mask, tgt, tgt_mask)
        return output


# Create an encoder
class Encoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()
        self.layers = nn.ModuleList([deepcopy(layer) for _ in range(N)])
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        output = self.norm(x)
        return output


class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = nn.ModuleList(
            [deepcopy(SublayerConnection(size, dropout)) for _ in range(2)])
        self.size = size

    def forward(self, x, mask):
        # Self-attention sub-layer, then position-wise feed-forward sub-layer
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        output = self.sublayer[1](x, self.feed_forward)
        return output


class SublayerConnection(nn.Module):
    # Residual connection around a sub-layer: normalize the input,
    # apply the sub-layer and dropout, then add the input back
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        output = x + self.dropout(sublayer(self.norm(x)))
        return output


class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        x_zscore = (x - mean) / torch.sqrt(std ** 2 + self.eps)
        output = self.a_2 * x_zscore + self.b_2
        return output


# Create a decoder
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()
        self.layers = nn.ModuleList([deepcopy(layer) for _ in range(N)])
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        output = self.norm(x)
        return output


class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = nn.ModuleList(
            [deepcopy(SublayerConnection(size, dropout)) for _ in range(3)])

    def forward(self, x, memory, src_mask, tgt_mask):
        # Masked self-attention, cross-attention over the encoder memory,
        # then the position-wise feed-forward sub-layer
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, memory, memory,
                                                        src_mask))
        output = self.sublayer[2](x, self.feed_forward)
        return output


# create the model
def create_model(src_vocab, tgt_vocab, N, d_model, d_ff, h, dropout=0.1):
    attn = MultiHeadedAttention(h, d_model).to(DEVICE)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout).to(DEVICE)
    pos = PositionalEncoding(d_model, dropout).to(DEVICE)
    model = Transformer(
        Encoder(EncoderLayer(d_model, deepcopy(attn), deepcopy(ff),
                             dropout).to(DEVICE), N).to(DEVICE),
        Decoder(DecoderLayer(d_model, deepcopy(attn), deepcopy(attn),
                             deepcopy(ff), dropout).to(DEVICE), N).to(DEVICE),
        nn.Sequential(Embeddings(d_model, src_vocab).to(DEVICE),
                      deepcopy(pos)),
        nn.Sequential(Embeddings(d_model, tgt_vocab).to(DEVICE),
                      deepcopy(pos)),
        Generator(d_model, tgt_vocab)).to(DEVICE)
    # Initialize weight matrices with Xavier uniform
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model.to(DEVICE)


class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        # Scale embeddings by sqrt(d_model), as in the original Transformer
        out = self.lut(x) * math.sqrt(self.d_model)
        return out


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Precompute sinusoidal position encodings up to max_len
        pe = torch.zeros(max_len, d_model, device=DEVICE)
        position = torch.arange(0., max_len, device=DEVICE).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2, device=DEVICE)
                             * -(math.log(10000.0) / d_model))
        pe_pos = torch.mul(position, div_term)
        pe[:, 0::2] = torch.sin(pe_pos)
        pe[:, 1::2] = torch.cos(pe_pos)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].requires_grad_(False)
        out = self.dropout(x)
        return out


def attention(query, key, value, mask=None, dropout=None):
    # Scaled dot-product attention
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = nn.functional.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn


class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = nn.ModuleList(
            [deepcopy(nn.Linear(d_model, d_model)) for _ in range(4)])
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        # Project queries, keys, and values, then split into h heads
        query, key, value = [
            linear(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for linear, x in zip(self.linears, (query, key, value))]
        x, self.attn = attention(query, key, value,
                                 mask=mask, dropout=self.dropout)
        # Concatenate the heads and apply the final linear projection
        x = x.transpose(1, 2).contiguous().view(
            nbatches, -1, self.h * self.d_k)
        output = self.linears[-1](x)
        return output


class Generator(nn.Module):
    # Project decoder outputs to log-probabilities over the vocabulary
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        out = self.proj(x)
        probs = nn.functional.log_softmax(out, dim=-1)
        return probs
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Expand, apply the ReLU nonlinearity, drop out, then project back
        h1 = nn.functional.relu(self.w_1(x))
        h2 = self.dropout(h1)
        return self.w_2(h2)
class LabelSmoothing(nn.Module):
    # KL-divergence loss against a label-smoothed target distribution
    def __init__(self, size, padding_idx, smoothing=0.1):
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        output = self.criterion(x, true_dist.clone().detach())
        return output


class SimpleLossCompute:
    # Apply the generator, compute the loss, backpropagate,
    # and (optionally) take an optimizer step
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)),
                              y.contiguous().view(-1)) / norm
        loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return loss.data.item() * norm.float()


class NoamOpt:
    # Learning-rate schedule from the original Transformer paper:
    # linear warmup followed by inverse-square-root decay
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        if step is None:
            step = self._step
        output = self.factor * (self.model_size ** (-0.5) *
                                min(step ** (-0.5),
                                    step * self.warmup ** (-1.5)))
        return output
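For reference, here is a minimal sketch of how the pieces in ch09util.py could be wired together for a single training step. It is not part of the commit: the vocabulary sizes, model hyperparameters, and the random toy batch are illustrative assumptions, and it presumes the file is importable as ch09util.

# Illustrative sketch only: toy sizes and random data, not the Space's actual setup
import numpy as np
import torch

from ch09util import (create_model, Batch, LabelSmoothing,
                      SimpleLossCompute, NoamOpt)

src_vocab, tgt_vocab = 1000, 1000                 # assumed vocabulary sizes
model = create_model(src_vocab, tgt_vocab, N=2,
                     d_model=64, d_ff=256, h=4)   # assumed hyperparameters

criterion = LabelSmoothing(size=tgt_vocab, padding_idx=0, smoothing=0.1)
base_opt = torch.optim.Adam(model.parameters(), lr=0,
                            betas=(0.9, 0.98), eps=1e-9)
optimizer = NoamOpt(model_size=64, factor=1, warmup=400, optimizer=base_opt)
loss_compute = SimpleLossCompute(model.generator, criterion, optimizer)

# One toy batch of token ids; id 0 is treated as padding by Batch
src = np.random.randint(1, src_vocab, size=(8, 10))
trg = np.random.randint(1, tgt_vocab, size=(8, 12))
batch = Batch(src, trg, pad=0)

out = model(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
loss = loss_compute(out, batch.trg_y, batch.ntokens)
print(float(loss))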
dict.p
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6c749ac01743123bd4a4683e075dbd828745e122146f160835149737f8bf23f2
size 492544
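The dict.p payload itself is stored in Git LFS, so only the pointer appears above. Assuming it is a pickled Python object (the file name suggests vocabulary mappings, but the contents are not shown here), it could be inspected along these lines:

import pickle

# Assumption: dict.p holds a pickled object such as word-to-index mappings;
# check the actual structure before relying on it.
with open("dict.p", "rb") as f:
    obj = pickle.load(f)
print(type(obj))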