fzbuzz committed on
Commit 5d30583 · Parent: 2ac6c87

Upload model

Files changed (5)
  1. config.json +27 -0
  2. config.py +58 -0
  3. model.py +30 -0
  4. original.py +247 -0
  5. pytorch_model.bin +3 -0
config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "adpt": 0.1,
+   "afn": "gelu",
+   "architectures": [
+     "HF_LMModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "config.COMET19_CN_Config",
+     "AutoModel": "model.HF_LMModel"
+   },
+   "edpt": 0.1,
+   "hSize": 768,
+   "init": "pt",
+   "model": "transformer",
+   "nH": 12,
+   "nL": 12,
+   "n_ctx": 31,
+   "n_vocab": 40545,
+   "odpt": 0.1,
+   "pt": "gpt",
+   "rdpt": 0.1,
+   "return_acts": true,
+   "return_probs": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1",
+   "vSize": 40545
+ }
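
The auto_map block above wires the custom classes defined in config.py and model.py into the transformers Auto* loaders. A minimal loading sketch (not part of the upload), assuming a placeholder repository id and a transformers release with trust_remote_code support:

    from transformers import AutoConfig, AutoModel

    repo_id = "user/comet-model"  # placeholder; substitute the actual Hub id of this repository
    config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)  # resolves to config.COMET19_CN_Config
    model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)    # resolves to model.HF_LMModel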
config.py ADDED
@@ -0,0 +1,58 @@
+ from transformers import PretrainedConfig
+
+ class COMET19_CN_Config(PretrainedConfig):
+     def __init__(
+         self,
+         model: str = "transformer",
+         nL: int = 12,
+         nH: int = 12,
+         hSize: int = 768,
+         edpt: float = 0.1,
+         adpt: float = 0.1,
+         rdpt: float = 0.1,
+         odpt: float = 0.1,
+         pt: str = "gpt",
+         afn: str = "gelu",
+         init: str = "pt",
+         vSize: int = 40545,
+         n_ctx: int = 31,
+         n_vocab: int = 40545,
+         return_acts: bool = True,
+         return_probs: bool = False,
+         **kwargs,
+     ):
+         self.model = model
+         self.nL = nL
+         self.nH = nH
+         self.hSize = hSize
+         self.edpt = edpt
+         self.adpt = adpt
+         self.rdpt = rdpt
+         self.odpt = odpt
+         self.pt = pt
+         self.afn = afn
+         self.init = init
+         self.vSize = vSize
+         self.n_ctx = n_ctx
+         self.n_vocab = n_vocab
+         self.return_acts = return_acts
+         self.return_probs = return_probs
+         super().__init__(**kwargs)
+
+
+ def parse_net_config(config):
+     return {
+         'model': config.model,
+         'nL': config.nL,
+         'nH': config.nH,
+         'hSize': config.hSize,
+         'edpt': config.edpt,
+         'adpt': config.adpt,
+         'rdpt': config.rdpt,
+         'odpt': config.odpt,
+         'pt': config.pt,
+         'afn': config.afn,
+         'init': config.init,
+         'vSize': config.vSize,
+         'n_ctx': config.n_ctx,
+     }
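
For illustration (not part of the upload): the constructor defaults above mirror config.json, so a bare COMET19_CN_Config() reproduces those values, and parse_net_config flattens a config instance into the plain dict the original training code reads:

    cfg = COMET19_CN_Config()        # defaults: hSize=768, nL=12, nH=12, n_ctx=31, vSize=40545, ...
    net_cfg = parse_net_config(cfg)  # {'model': 'transformer', 'nL': 12, ..., 'n_ctx': 31}
    assert net_cfg['hSize'] == 768 and net_cfg['vSize'] == 40545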
model.py ADDED
@@ -0,0 +1,30 @@
+ import torch
+ import torch.nn.functional as F
+
+ from transformers import PreTrainedModel
+ from .original import TransformerModel, LMHead
+ '''
+ Code for HuggingFace Hub Compatibility
+ '''
+
+ class HF_LMModel(PreTrainedModel):
+     """ Transformer with language model head only """
+     def __init__(self, config):
+         super().__init__(config)
+         self.transformer = TransformerModel(config, vocab=config.n_vocab, n_ctx=config.n_ctx)
+         self.lm_head = LMHead(self.transformer, config, trunc_and_reshape=False)
+         self.return_probs = config.return_probs
+         self.return_acts = config.return_acts
+         if self.return_probs or self.return_acts:
+             pos_emb_mask = torch.zeros(1, 1, config.n_vocab)
+             pos_emb_mask[:, :, -config.n_ctx:] = -1e12
+             self.register_buffer('pos_emb_mask', pos_emb_mask)
+
+     def forward(self, x, sequence_mask=None):
+         h = self.transformer(x, sequence_mask)
+         lm_logits = self.lm_head(h)
+         if self.return_probs:
+             lm_logits = F.softmax(lm_logits + self.pos_emb_mask, dim=-1)
+         elif self.return_acts:
+             lm_logits = lm_logits + self.pos_emb_mask
+         return { "logits": lm_logits }
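
HF_LMModel.forward keeps the input layout of the original COMET code: a LongTensor of shape (batch, seq_len, 2), where [..., 0] holds token ids and [..., 1] holds position ids, and the two embeddings are summed inside TransformerModel. A rough shape sketch (not part of the upload), assuming COMET19_CN_Config and HF_LMModel are already importable (the Auto* loaders above handle that in practice) and that position ids occupy the last n_ctx rows of the embedding table, which is what pos_emb_mask suggests:

    import torch

    cfg = COMET19_CN_Config()
    model = HF_LMModel(cfg)  # randomly initialised here; use from_pretrained for the uploaded weights

    batch, seq_len = 2, 10   # seq_len must not exceed cfg.n_ctx (31)
    tokens = torch.randint(0, cfg.n_vocab - cfg.n_ctx, (batch, seq_len))
    positions = torch.arange(cfg.n_vocab - cfg.n_ctx,
                             cfg.n_vocab - cfg.n_ctx + seq_len).expand(batch, seq_len)
    x = torch.stack([tokens, positions], dim=-1)  # (batch, seq_len, 2)

    out = model(x)
    print(out["logits"].shape)                    # torch.Size([2, 10, 40545])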
original.py ADDED
@@ -0,0 +1,247 @@
+ import copy
+ import math
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn.parameter import Parameter
+
+
+ '''
+ Much of this code is taken from HuggingFace's OpenAI LM Implementation here:
+
+ https://github.com/huggingface/pytorch-openai-transformer-lm
+ '''
+
+
+ def gelu(x):
+     return (0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) *
+                                       (x + 0.044715 * torch.pow(x, 3)))))
+
+
+ def swish(x):
+     return x * torch.sigmoid(x)
+
+
+ ACT_FNS = {
+     'relu': nn.ReLU,
+     'swish': swish,
+     'gelu': gelu
+ }
+
+
+ class LayerNorm(nn.Module):
+     "Construct a layernorm module in the OpenAI style \
+     (epsilon inside the square root)."
+
+     def __init__(self, n_state, e=1e-5):
+         super(LayerNorm, self).__init__()
+         self.g = nn.Parameter(torch.ones(n_state))
+         self.b = nn.Parameter(torch.zeros(n_state))
+         self.e = e
+
+     def forward(self, x):
+         u = x.mean(-1, keepdim=True)
+         s = (x - u).pow(2).mean(-1, keepdim=True)
+         x = (x - u) / torch.sqrt(s + self.e)
+         return self.g * x + self.b
+
+
+ class Conv1D(nn.Module):
+     def __init__(self, nf, rf, nx):
+         super(Conv1D, self).__init__()
+         self.rf = rf
+         self.nf = nf
+         if rf == 1:  # faster 1x1 conv
+             w = torch.empty(nx, nf)
+             nn.init.normal_(w, std=0.02)
+             self.w = Parameter(w)
+             self.b = Parameter(torch.zeros(nf))
+         else:  # was used to train LM
+             raise NotImplementedError
+
+     def forward(self, x):
+         if self.rf == 1:
+             size_out = x.size()[:-1] + (self.nf,)
+             x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
+             x = x.view(*size_out)
+         else:
+             raise NotImplementedError
+         return x
+
+
+ class Attention(nn.Module):
+     def __init__(self, nx, n_ctx, cfg, scale=False):
+         super(Attention, self).__init__()
+         n_state = nx  # in Attention: n_state=768 (nx=n_embd)
+
+         assert n_state % cfg.nH == 0
+         self.register_buffer('b', torch.tril(torch.ones(
+             n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
+         self.n_head = cfg.nH
+         self.split_size = n_state
+         self.scale = scale
+         self.c_attn = Conv1D(n_state * 3, 1, nx)
+         self.c_proj = Conv1D(n_state, 1, nx)
+         self.attn_dropout = nn.Dropout(cfg.adpt)
+         self.resid_dropout = nn.Dropout(cfg.rdpt)
+
+     # dimensions of w: (batch_size x num_heads x seq_length x seq_length)
+     def _attn(self, q, k, v, sequence_mask):
+         w = torch.matmul(q, k)
+         if self.scale:
+             w = w / math.sqrt(v.size(-1))
+
+         b_subset = self.b[:, :, :w.size(-2), :w.size(-1)]
+
+         if sequence_mask is not None:
+             b_subset = b_subset * sequence_mask.view(
+                 sequence_mask.size(0), 1, -1)
+             b_subset = b_subset.permute(1, 0, 2, 3)
+
+         w = w * b_subset + -1e9 * (1 - b_subset)
+         w = nn.Softmax(dim=-1)(w)
+         w = self.attn_dropout(w)
+         return torch.matmul(w, v)
+
+     def merge_heads(self, x):
+         x = x.permute(0, 2, 1, 3).contiguous()
+         new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
+         return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
+
+     def split_heads(self, x, k=False):
+         new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
+         x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
+         if k:
+             return x.permute(0, 2, 3, 1)
+         else:
+             return x.permute(0, 2, 1, 3)
+
+     def forward(self, x, sequence_mask):
+         x = self.c_attn(x)
+         query, key, value = x.split(self.split_size, dim=2)
+         query = self.split_heads(query)
+         key = self.split_heads(key, k=True)
+         value = self.split_heads(value)
+         a = self._attn(query, key, value, sequence_mask)
+         a = self.merge_heads(a)
+         a = self.c_proj(a)
+         a = self.resid_dropout(a)
+         return a
+
+
+ class MLP(nn.Module):
+     def __init__(self, n_state, cfg):  # in MLP: n_state=3072 (4 * n_embd)
+         super(MLP, self).__init__()
+         nx = cfg.hSize
+         self.c_fc = Conv1D(n_state, 1, nx)
+         self.c_proj = Conv1D(nx, 1, n_state)
+         self.act = ACT_FNS[cfg.afn]
+         self.dropout = nn.Dropout(cfg.rdpt)
+
+     def forward(self, x):
+         h = self.act(self.c_fc(x))
+         h2 = self.c_proj(h)
+         return self.dropout(h2)
+
+
+ class Block(nn.Module):
+     def __init__(self, n_ctx, cfg, scale=False):
+         super(Block, self).__init__()
+         nx = cfg.hSize
+         self.attn = Attention(nx, n_ctx, cfg, scale)
+         self.ln_1 = LayerNorm(nx)
+         self.mlp = MLP(4 * nx, cfg)
+         self.ln_2 = LayerNorm(nx)
+
+     def forward(self, x, sequence_mask):
+         a = self.attn(x, sequence_mask)
+         n = self.ln_1(x + a)
+         m = self.mlp(n)
+         h = self.ln_2(n + m)
+         return h
+
+
+ class TransformerModel(nn.Module):
+     """ Transformer model """
+
+     def __init__(self, cfg, vocab=40990, n_ctx=512):
+         super(TransformerModel, self).__init__()
+         self.vocab = vocab
+         self.embed = nn.Embedding(vocab, cfg.hSize)
+         self.drop = nn.Dropout(cfg.edpt)
+         block = Block(n_ctx, cfg, scale=True)
+         self.h = nn.ModuleList([copy.deepcopy(block)
+                                 for _ in range(cfg.nL)])
+
+         nn.init.normal_(self.embed.weight, std=0.02)
+
+     def forward(self, x, sequence_mask):
+         x = x.view(-1, x.size(-2), x.size(-1))
+         e = self.embed(x)
+         # Add the position information to the input embeddings
+         h = e.sum(dim=2)
+         for block in self.h:
+             h = block(h, sequence_mask)
+         return h
+
+
+ class LMModel(nn.Module):
+     """ Transformer with language model head only """
+     def __init__(self, cfg, vocab=40990, n_ctx=512,
+                  return_probs=False, return_acts=False):
+         super(LMModel, self).__init__()
+         self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
+         self.lm_head = LMHead(self.transformer, cfg, trunc_and_reshape=False)
+         self.return_probs = return_probs
+         self.return_acts = return_acts
+         if self.return_probs or self.return_acts:
+             pos_emb_mask = torch.zeros(1, 1, vocab)
+             pos_emb_mask[:, :, -n_ctx:] = -1e12
+             self.register_buffer('pos_emb_mask', pos_emb_mask)
+
+     def forward(self, x, sequence_mask=None):
+         h = self.transformer(x, sequence_mask)
+         lm_logits = self.lm_head(h)
+         if self.return_probs:
+             lm_logits = F.softmax(lm_logits + self.pos_emb_mask, dim=-1)
+         elif self.return_acts:
+             lm_logits = lm_logits + self.pos_emb_mask
+         return lm_logits
+
+
+ class LMHead(nn.Module):
+     """ Language Model Head for the transformer """
+
+     def __init__(self, model, cfg, trunc_and_reshape=True):
+         super(LMHead, self).__init__()
+         self.n_embd = cfg.hSize
+         embed_shape = model.embed.weight.shape
+         self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
+         self.decoder.weight = model.embed.weight  # Tied weights
+         self.trunc_and_reshape = trunc_and_reshape  # XD
+
+     def forward(self, h):
+         # Truncated Language modeling logits (we remove the last token)
+         h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd) \
+             if self.trunc_and_reshape else h  # XD
+         lm_logits = self.decoder(h_trunc)
+         return lm_logits
+
+
+ class dotdict(dict):
+     """dot.notation access to dictionary attributes"""
+     __getattr__ = dict.get
+     __setattr__ = dict.__setitem__
+     __delattr__ = dict.__delitem__
+
+
+ DEFAULT_CONFIG = dotdict({
+     'n_embd': 768,
+     'n_head': 12,
+     'n_layer': 12,
+     'embd_pdrop': 0.1,
+     'attn_pdrop': 0.1,
+     'resid_pdrop': 0.1,
+     'afn': 'gelu',
+     'clf_pdrop': 0.1})
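
Two notes for anyone running this file on its own (neither is part of the upload): LMModel is the original standalone wrapper that HF_LMModel in model.py mirrors for Hub loading, and DEFAULT_CONFIG uses the upstream repository's attribute names (n_embd, n_head, ...) while the classes above read the COMET-style names (hSize, nH, ...), so it does not appear to be directly consumable by them. A minimal sketch driving LMModel with a dotdict that carries the attribute names the classes actually read, values copied from config.json:

    import torch

    cfg = dotdict({
        'hSize': 768, 'nH': 12, 'nL': 12,
        'edpt': 0.1, 'adpt': 0.1, 'rdpt': 0.1,
        'afn': 'gelu',
    })
    model = LMModel(cfg, vocab=40545, n_ctx=31, return_acts=True)  # randomly initialised

    x = torch.zeros(2, 10, 2, dtype=torch.long)             # [..., 0] token ids, [..., 1] position ids
    x[:, :, 1] = torch.arange(40545 - 31, 40545 - 31 + 10)  # positions assumed to sit in the last n_ctx rows
    logits = model(x)                                        # shape (2, 10, 40545)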
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a295a5827a7c1e8f09031d9c45036c3a7047b59505f1966a829933fc59819961
+ size 465028869