Upload FlashSTU
- config.json +3 -3
- config.py +5 -0
- model.py +7 -6
- model.safetensors +3 -0
config.json
CHANGED
@@ -11,9 +11,9 @@
   "dropout": 0.0,
   "mlp_scale": 12,
   "model_type": "FlashSTU",
-  "n_embd":
-  "n_heads":
-  "n_layers":
+  "n_embd": 256,
+  "n_heads": 2,
+  "n_layers": 2,
   "num_eigh": 24,
   "seq_len": 8192,
   "softcap": 50.0,
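The updated hyperparameters describe a small model: 2 layers, 2 attention heads, and 256-dimensional embeddings, with the 8192 sequence length and 24 eigenvalues unchanged. A minimal sketch of reading this file back through the accompanying FlashSTUConfig class, assuming config.py sits in the same directory as config.json:

# Sketch: load the uploaded config.json via the custom config class from this repo.
# Assumes the repository files are in the current working directory.
from config import FlashSTUConfig

config = FlashSTUConfig.from_pretrained(".")   # reads ./config.json
print(config.n_embd, config.n_heads, config.n_layers)    # 256 2 2
print(config.seq_len, config.num_eigh, config.softcap)   # 8192 24 50.0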
config.py
CHANGED
@@ -1,5 +1,8 @@
+import torch
+
 from transformers import PretrainedConfig

+
 class FlashSTUConfig(PretrainedConfig):
     model_type = "FlashSTU"

@@ -20,6 +23,7 @@ class FlashSTUConfig(PretrainedConfig):
         use_flash_fft: bool = True,
         use_approx: bool = True,
         softcap: float = 50.0,
+        torch_dtype: torch.dtype = torch.bfloat16,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -38,3 +42,4 @@ class FlashSTUConfig(PretrainedConfig):
         self.use_flash_fft = use_flash_fft
         self.use_approx = use_approx
         self.softcap = softcap
+        self.torch_dtype = torch_dtype
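The config now takes a torch_dtype argument (defaulting to torch.bfloat16) and stores it on the instance, so downstream modules can be built directly in that precision. A hedged usage sketch; the n_embd/n_heads/n_layers keywords mirror the keys in config.json and are assumed to be accepted either as explicit parameters or through **kwargs:

import torch
from config import FlashSTUConfig

# Sketch only: override the new bfloat16 default with float16.
# n_embd / n_heads / n_layers are assumed keyword arguments (they match config.json).
config = FlashSTUConfig(
    n_embd=256,
    n_heads=2,
    n_layers=2,
    torch_dtype=torch.float16,
)
print(config.torch_dtype)  # torch.float16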
model.py
CHANGED
@@ -33,19 +33,20 @@ class Block(nn.Module):
     def __init__(self, config, phi, n) -> None:
         super(Block, self).__init__()
         # For more complex %-split arrangements, see https://arxiv.org/pdf/2406.07887
-        self.rn_1 = RMSNorm(config.n_embd)
+        self.rn_1 = RMSNorm(config.n_embd, dtype=config.torch_dtype)
         self.stu = STU(config, phi, n)
-        self.rn_2 = RMSNorm(config.n_embd)
+        self.rn_2 = RMSNorm(config.n_embd, dtype=config.torch_dtype)
         self.attn = Attention(config)
-        self.rn_3 = RMSNorm(config.n_embd)
+        self.rn_3 = RMSNorm(config.n_embd, dtype=config.torch_dtype)
         self.mlp = MLP(
             config.n_embd,
             config.n_embd * config.mlp_scale,
             activation=F.silu,  # Use SwiGLU
             bias1=config.bias,
             bias2=config.bias,
-        ) if triton_mlp else MLP(config)
-        self.rn_4 = RMSNorm(config.n_embd)
+            dtype=config.torch_dtype,
+        ) if triton_mlp else MLP(config, dtype=config.torch_dtype)
+        self.rn_4 = RMSNorm(config.n_embd, dtype=config.torch_dtype)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x = x + self.stu(self.rn_1(x))
@@ -84,7 +85,7 @@ class FlashSTU(PreTrainedModel):
                         for _ in range(self.n_layers)
                     ]
                 ),
-                rn_f=RMSNorm(config.n_embd)
+                rn_f=RMSNorm(config.n_embd, dtype=config.torch_dtype)
             )
         )
         self.lm_head = nn.Linear(self.n_embd, self.vocab_size, bias=self.bias)
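The Block changes thread config.torch_dtype into every RMSNorm and into the MLP, so parameters are allocated in the configured precision (bfloat16 by default) rather than float32. This relies on RMSNorm and MLP accepting a dtype keyword; a rough sketch of an RMSNorm with that interface (hypothetical, since the actual RMSNorm/MLP implementations are not part of this commit):

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Hypothetical RMSNorm whose weight is created in a chosen dtype,
    matching the RMSNorm(config.n_embd, dtype=config.torch_dtype) calls above."""

    def __init__(self, dim: int, eps: float = 1e-5, dtype: torch.dtype = torch.float32) -> None:
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim, dtype=dtype))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalize in float32 for numerical stability, then cast back to the input dtype.
        x_fp32 = x.float()
        normed = x_fp32 * torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + self.eps)
        return normed.type_as(x) * self.weight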
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95020e80f8dc983d77cbd36edbf46090d71628617c31e9fce4aedf6fbc79e74e
+size 420811608
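model.safetensors is tracked with Git LFS, so the commit records only the pointer file: the LFS spec version, the SHA-256 of the blob, and its size (420,811,608 bytes, about 420 MB); the weights themselves live in LFS storage. A sketch of downloading and opening them, assuming the repository id is known (the id below is a placeholder, not taken from this commit):

# Sketch: fetch the LFS-backed weights and inspect them.
# "your-username/FlashSTU" is a placeholder repo id.
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

path = hf_hub_download(repo_id="your-username/FlashSTU", filename="model.safetensors")
state_dict = load_file(path)  # maps parameter names to torch.Tensors
total = sum(t.numel() for t in state_dict.values())
print(f"{len(state_dict)} tensors, {total / 1e6:.1f}M parameters")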