Upload model
- config.json +27 -0
- config.py +58 -0
- model.py +30 -0
- original.py +247 -0
- pytorch_model.bin +3 -0
config.json
ADDED
@@ -0,0 +1,27 @@
{
  "adpt": 0.1,
  "afn": "gelu",
  "architectures": [
    "HF_LMModel"
  ],
  "auto_map": {
    "AutoConfig": "config.COMET19_CN_Config",
    "AutoModel": "model.HF_LMModel"
  },
  "edpt": 0.1,
  "hSize": 768,
  "init": "pt",
  "model": "transformer",
  "nH": 12,
  "nL": 12,
  "n_ctx": 31,
  "n_vocab": 40545,
  "odpt": 0.1,
  "pt": "gpt",
  "rdpt": 0.1,
  "return_acts": true,
  "return_probs": false,
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "vSize": 40545
}
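The `auto_map` block above is what makes this upload usable through the Auto classes: `AutoConfig` resolves to `config.COMET19_CN_Config` and `AutoModel` to `model.HF_LMModel`. A minimal loading sketch, with a placeholder repository id (the real id is not part of this commit):

# Illustrative sketch; "<user>/<repo>" is a placeholder, not taken from this commit.
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("<user>/<repo>", trust_remote_code=True)
model = AutoModel.from_pretrained("<user>/<repo>", trust_remote_code=True)
# trust_remote_code=True is needed because the classes live in this repository,
# not in the transformers library itself.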
config.py
ADDED
@@ -0,0 +1,58 @@
from transformers import PretrainedConfig

class COMET19_CN_Config(PretrainedConfig):
    def __init__(
        self,
        model: str = "transformer",
        nL: int = 12,
        nH: int = 12,
        hSize: int = 768,
        edpt: float = 0.1,
        adpt: float = 0.1,
        rdpt: float = 0.1,
        odpt: float = 0.1,
        pt: str = "gpt",
        afn: str = "gelu",
        init: str = "pt",
        vSize: int = 40545,
        n_ctx: int = 31,
        n_vocab: int = 40545,
        return_acts: bool = True,
        return_probs: bool = False,
        **kwargs,
    ):
        self.model = model
        self.nL = nL
        self.nH = nH
        self.hSize = hSize
        self.edpt = edpt
        self.adpt = adpt
        self.rdpt = rdpt
        self.odpt = odpt
        self.pt = pt
        self.afn = afn
        self.init = init
        self.vSize = vSize
        self.n_ctx = n_ctx
        self.n_vocab = n_vocab
        self.return_acts = return_acts
        self.return_probs = return_probs
        super().__init__(**kwargs)


def parse_net_config(config):
    return {
        'model': config.model,
        'nL': config.nL,
        'nH': config.nH,
        'hSize': config.hSize,
        'edpt': config.edpt,
        'adpt': config.adpt,
        'rdpt': config.rdpt,
        'odpt': config.odpt,
        'pt': config.pt,
        'afn': config.afn,
        'init': config.init,
        'vSize': config.vSize,
        'n_ctx': config.n_ctx,
    }
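For reference, a minimal local sketch of the config defaults and of `parse_net_config`, assuming `config.py` is importable from the working directory:

# Minimal sketch; assumes config.py is on the Python path.
from config import COMET19_CN_Config, parse_net_config

cfg = COMET19_CN_Config()        # defaults mirror config.json above
net_cfg = parse_net_config(cfg)  # plain dict view of the network hyperparameters
print(net_cfg['hSize'], net_cfg['nL'], net_cfg['n_ctx'])  # 768 12 31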
model.py
ADDED
@@ -0,0 +1,30 @@
import torch
import torch.nn.functional as F

from transformers import PreTrainedModel
from .original import TransformerModel, LMHead

'''
Code for HuggingFace Hub Compatibility
'''

class HF_LMModel(PreTrainedModel):
    """ Transformer with language model head only """
    def __init__(self, config):
        super().__init__(config)
        self.transformer = TransformerModel(config, vocab=config.n_vocab, n_ctx=config.n_ctx)
        self.lm_head = LMHead(self.transformer, config, trunc_and_reshape=False)
        self.return_probs = config.return_probs
        self.return_acts = config.return_acts
        if self.return_probs or self.return_acts:
            pos_emb_mask = torch.zeros(1, 1, config.n_vocab)
            pos_emb_mask[:, :, -config.n_ctx:] = -1e12
            self.register_buffer('pos_emb_mask', pos_emb_mask)

    def forward(self, x, sequence_mask=None):
        h = self.transformer(x, sequence_mask)
        lm_logits = self.lm_head(h)
        if self.return_probs:
            lm_logits = F.softmax(lm_logits + self.pos_emb_mask, dim=-1)
        elif self.return_acts:
            lm_logits = lm_logits + self.pos_emb_mask
        return {"logits": lm_logits}
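`HF_LMModel` wraps the original `TransformerModel`, whose forward pass sums embeddings along dim 2. The model therefore appears to expect inputs of shape (batch, seq, 2) holding a token id and a position id per step, with positions mapped to the last `n_ctx` rows of the embedding matrix (the same slots that `pos_emb_mask` removes from the logits). A shape-only sketch under those assumptions, again with a placeholder repository id:

# Shape sketch only; the (token id, position id) packing is inferred from
# TransformerModel.forward and pos_emb_mask, and "<user>/<repo>" is a placeholder.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained("<user>/<repo>", trust_remote_code=True).eval()
cfg = model.config

batch, seq = 2, cfg.n_ctx
token_ids = torch.randint(0, cfg.n_vocab - cfg.n_ctx, (batch, seq))
position_ids = torch.arange(cfg.n_vocab - cfg.n_ctx, cfg.n_vocab).expand(batch, seq)
x = torch.stack([token_ids, position_ids], dim=-1)   # (batch, seq, 2)

with torch.no_grad():
    out = model(x)
print(out["logits"].shape)                           # (batch, seq, n_vocab)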
original.py
ADDED
@@ -0,0 +1,247 @@
import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter


'''
Much of this code is taken from HuggingFace's OpenAI LM Implementation here:

https://github.com/huggingface/pytorch-openai-transformer-lm
'''


def gelu(x):
    return (0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) *
                                      (x + 0.044715 * torch.pow(x, 3)))))


def swish(x):
    return x * torch.sigmoid(x)


ACT_FNS = {
    'relu': nn.ReLU,
    'swish': swish,
    'gelu': gelu
}


class LayerNorm(nn.Module):
    "Construct a layernorm module in the OpenAI style \
    (epsilon inside the square root)."

    def __init__(self, n_state, e=1e-5):
        super(LayerNorm, self).__init__()
        self.g = nn.Parameter(torch.ones(n_state))
        self.b = nn.Parameter(torch.zeros(n_state))
        self.e = e

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.e)
        return self.g * x + self.b


class Conv1D(nn.Module):
    def __init__(self, nf, rf, nx):
        super(Conv1D, self).__init__()
        self.rf = rf
        self.nf = nf
        if rf == 1:  # faster 1x1 conv
            w = torch.empty(nx, nf)
            nn.init.normal_(w, std=0.02)
            self.w = Parameter(w)
            self.b = Parameter(torch.zeros(nf))
        else:  # was used to train LM
            raise NotImplementedError

    def forward(self, x):
        if self.rf == 1:
            size_out = x.size()[:-1] + (self.nf,)
            x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
            x = x.view(*size_out)
        else:
            raise NotImplementedError
        return x


class Attention(nn.Module):
    def __init__(self, nx, n_ctx, cfg, scale=False):
        super(Attention, self).__init__()
        n_state = nx  # in Attention: n_state=768 (nx=n_embd)

        assert n_state % cfg.nH == 0
        self.register_buffer('b', torch.tril(torch.ones(
            n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
        self.n_head = cfg.nH
        self.split_size = n_state
        self.scale = scale
        self.c_attn = Conv1D(n_state * 3, 1, nx)
        self.c_proj = Conv1D(n_state, 1, nx)
        self.attn_dropout = nn.Dropout(cfg.adpt)
        self.resid_dropout = nn.Dropout(cfg.rdpt)

    # dimensions of w: (batch_size x num_heads x seq_length x seq_length)
    def _attn(self, q, k, v, sequence_mask):
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))

        b_subset = self.b[:, :, :w.size(-2), :w.size(-1)]

        if sequence_mask is not None:
            b_subset = b_subset * sequence_mask.view(
                sequence_mask.size(0), 1, -1)
            b_subset = b_subset.permute(1, 0, 2, 3)

        w = w * b_subset + -1e9 * (1 - b_subset)
        w = nn.Softmax(dim=-1)(w)
        w = self.attn_dropout(w)
        return torch.matmul(w, v)

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states

    def split_heads(self, x, k=False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
        if k:
            return x.permute(0, 2, 3, 1)
        else:
            return x.permute(0, 2, 1, 3)

    def forward(self, x, sequence_mask):
        x = self.c_attn(x)
        query, key, value = x.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        a = self._attn(query, key, value, sequence_mask)
        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)
        return a


class MLP(nn.Module):
    def __init__(self, n_state, cfg):  # in MLP: n_state=3072 (4 * n_embd)
        super(MLP, self).__init__()
        nx = cfg.hSize
        self.c_fc = Conv1D(n_state, 1, nx)
        self.c_proj = Conv1D(nx, 1, n_state)
        self.act = ACT_FNS[cfg.afn]
        self.dropout = nn.Dropout(cfg.rdpt)

    def forward(self, x):
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
        return self.dropout(h2)


class Block(nn.Module):
    def __init__(self, n_ctx, cfg, scale=False):
        super(Block, self).__init__()
        nx = cfg.hSize
        self.attn = Attention(nx, n_ctx, cfg, scale)
        self.ln_1 = LayerNorm(nx)
        self.mlp = MLP(4 * nx, cfg)
        self.ln_2 = LayerNorm(nx)

    def forward(self, x, sequence_mask):
        a = self.attn(x, sequence_mask)
        n = self.ln_1(x + a)
        m = self.mlp(n)
        h = self.ln_2(n + m)
        return h


class TransformerModel(nn.Module):
    """ Transformer model """

    def __init__(self, cfg, vocab=40990, n_ctx=512):
        super(TransformerModel, self).__init__()
        self.vocab = vocab
        self.embed = nn.Embedding(vocab, cfg.hSize)
        self.drop = nn.Dropout(cfg.edpt)
        block = Block(n_ctx, cfg, scale=True)
        self.h = nn.ModuleList([copy.deepcopy(block)
                                for _ in range(cfg.nL)])

        nn.init.normal_(self.embed.weight, std=0.02)

    def forward(self, x, sequence_mask):
        x = x.view(-1, x.size(-2), x.size(-1))
        e = self.embed(x)
        # Add the position information to the input embeddings
        h = e.sum(dim=2)
        for block in self.h:
            h = block(h, sequence_mask)
        return h


class LMModel(nn.Module):
    """ Transformer with language model head only """
    def __init__(self, cfg, vocab=40990, n_ctx=512,
                 return_probs=False, return_acts=False):
        super(LMModel, self).__init__()
        self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
        self.lm_head = LMHead(self.transformer, cfg, trunc_and_reshape=False)
        self.return_probs = return_probs
        self.return_acts = return_acts
        if self.return_probs or self.return_acts:
            pos_emb_mask = torch.zeros(1, 1, vocab)
            pos_emb_mask[:, :, -n_ctx:] = -1e12
            self.register_buffer('pos_emb_mask', pos_emb_mask)

    def forward(self, x, sequence_mask=None):
        h = self.transformer(x, sequence_mask)
        lm_logits = self.lm_head(h)
        if self.return_probs:
            lm_logits = F.softmax(lm_logits + self.pos_emb_mask, dim=-1)
        elif self.return_acts:
            lm_logits = lm_logits + self.pos_emb_mask
        return lm_logits


class LMHead(nn.Module):
    """ Language Model Head for the transformer """

    def __init__(self, model, cfg, trunc_and_reshape=True):
        super(LMHead, self).__init__()
        self.n_embd = cfg.hSize
        embed_shape = model.embed.weight.shape
        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
        self.decoder.weight = model.embed.weight  # Tied weights
        self.trunc_and_reshape = trunc_and_reshape  # XD

    def forward(self, h):
        # Truncated Language modeling logits (we remove the last token)
        h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd) \
            if self.trunc_and_reshape else h  # XD
        lm_logits = self.decoder(h_trunc)
        return lm_logits


class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


DEFAULT_CONFIG = dotdict({
    'n_embd': 768,
    'n_head': 12,
    'n_layer': 12,
    'embd_pdrop': 0.1,
    'attn_pdrop': 0.1,
    'resid_pdrop': 0.1,
    'afn': 'gelu',
    'clf_pdrop': 0.1})
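One small aside on `original.py`: the `gelu` defined there is the tanh approximation, which should match PyTorch's built-in approximate GELU (assuming PyTorch >= 1.12, where the `approximate="tanh"` argument exists):

# Sanity-check sketch; requires PyTorch >= 1.12 and original.py on the Python path.
import torch
import torch.nn.functional as F
from original import gelu

x = torch.randn(4, 8)
assert torch.allclose(gelu(x), F.gelu(x, approximate="tanh"), atol=1e-6)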
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a295a5827a7c1e8f09031d9c45036c3a7047b59505f1966a829933fc59819961
size 465028869