George-Ogden committed on
Commit
802062f
·
1 Parent(s): daae26a

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +56 -0
README.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ datasets:
4
+ - wikipedia
5
+ language:
6
+ - en
7
+ tags:
8
+ - research
9
+ ---
10
+ This model is significantly undertrained and designed for research purposes only.
11
+ To use the model with the `transformers` library:
12
+ ```python
13
+ from transformers import AutoTokenizer, GPT2Model
14
+
15
+ import torch.nn as nn
16
+ import torch
17
+
18
class RMSLayerNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension of the input.

    Every vector along the final axis is divided by
    ``sqrt(mean(x ** 2) + eps)``. The mean is not subtracted and no bias is
    ever applied. When ``affine`` is true the normalised output is multiplied
    by a single learnable scalar ``weight`` (shape ``()``), not by a
    per-feature vector.

    Args:
        normalized_shape: Stored on the module as ``normalized_shape``.
            ``forward`` does not read it; the reduction is always taken over
            the last dimension only.
        eps: Constant added to the mean square before the square root.
        affine: If true, create the learnable scalar gain ``weight``;
            otherwise ``weight`` is registered as ``None``.
    """

    def __init__(self, normalized_shape, eps=1e-8, affine=True):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps
        self.affine = affine

        if self.affine:
            # Scalar gain. The shape is part of the module's state dict, so it
            # is kept as () rather than widened to ``normalized_shape``.
            self.weight = nn.Parameter(torch.ones(()))
        else:
            self.register_parameter("weight", None)
        # There is never a bias term. Register it as None in both modes so
        # ``module.bias`` exists whether or not ``affine`` is set; a None
        # parameter adds no entry to the state dict.
        self.register_parameter("bias", None)

    def forward(self, x):
        """Return ``x`` scaled to unit root-mean-square along its last axis."""
        # keepdim=True lets the per-vector RMS broadcast back against ``x``.
        rms = torch.sqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
        x_normalized = x / rms
        if self.affine:
            x_normalized = x_normalized * self.weight
        return x_normalized
37
+
38
+
39
def replace(model):
    """Swap every ``LayerNorm`` inside ``model`` for an ``RMSLayerNorm``.

    The module tree is walked recursively and modified in place. Each
    replacement is a freshly constructed ``RMSLayerNorm`` that copies the
    original layer's ``normalized_shape`` and ``eps`` and always enables the
    affine gain; the original layer's own parameters are not carried over.

    Returns:
        The same ``model`` object that was passed in.
    """
    for child_name, submodule in model.named_children():
        if not isinstance(submodule, nn.modules.normalization.LayerNorm):
            # Not a LayerNorm itself: look for LayerNorms nested inside it.
            replace(submodule)
            continue
        rms_norm = RMSLayerNorm(submodule.normalized_shape, eps=submodule.eps, affine=True)
        setattr(model, child_name, rms_norm)
    return model
46
+
47
+
48
class GPTR2Model(GPT2Model):
    """``GPT2Model`` variant whose LayerNorm modules become ``RMSLayerNorm``."""

    def __init__(self, config):
        # Build the stock GPT-2 module tree first, then swap the norms in place.
        super(GPTR2Model, self).__init__(config)
        replace(self)
52
+
53
# Load the weights from the Hugging Face Hub into the RMS-norm architecture.
model = GPTR2Model.from_pretrained(
    "George-Ogden/gptr2-nano-with-momentum-without-weight-decay"
)
# The tokenizer is loaded from the "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
55
+ ```
56
+ For more details and example usage, see https://github.com/George-Ogden/residual-streams