---
license: mit
datasets:
- wikipedia
language:
- en
tags:
- research
---

This model is significantly undertrained and designed for research purposes only.

For use in transformers:

```python
from transformers import AutoTokenizer, GPT2Model
import torch.nn as nn
import torch


class RMSLayerNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension.

    The input is divided by ``sqrt(mean(x**2, dim=-1) + eps)``. There is no
    mean subtraction and no bias term; when ``affine`` is true the result is
    scaled by a single learnable scalar ``weight`` (shape ``()``), not by a
    per-feature vector.

    Args:
        normalized_shape: Stored for reference only; the computation always
            reduces over the last dimension regardless of this value.
        eps: Constant added to the mean square before the square root.
        affine: Whether to create and apply the learnable scalar gain.
    """

    def __init__(self, normalized_shape, eps=1e-8, affine=True):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps
        self.affine = affine
        if self.affine:
            # Scalar gain, initialised to 1 so the layer starts as a pure
            # RMS normalisation.
            self.weight = nn.Parameter(torch.ones(()))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` scaled to unit RMS along its last dimension."""
        rms = torch.sqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
        x_normalized = x / rms
        if self.affine:
            x_normalized = x_normalized * self.weight
        return x_normalized


def replace(model: nn.Module) -> nn.Module:
    """Swap every ``nn.LayerNorm`` inside ``model`` for an ``RMSLayerNorm``.

    The module tree is walked recursively and modified in place. Each
    replacement reuses the original layer's ``normalized_shape`` and ``eps``
    and is always created with ``affine=True``.

    Returns:
        The same ``model`` object that was passed in.
    """
    for name, child in model.named_children():
        if isinstance(child, nn.modules.normalization.LayerNorm):
            setattr(model, name, RMSLayerNorm(child.normalized_shape, eps=child.eps, affine=True))
        else:
            replace(child)
    return model


class GPTR2Model(GPT2Model):
    """``GPT2Model`` whose ``LayerNorm`` layers are replaced by ``RMSLayerNorm``."""

    def __init__(self, config):
        super().__init__(config)
        # Done in the constructor so that every instance has the RMS layers
        # in place as soon as it is built.
        replace(self)


model = GPTR2Model.from_pretrained("George-Ogden/gptr2-nano-with-momentum-with-weight-decay")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
```

For more details and example usage, see https://github.com/George-Ogden/residual-streams