Upload FlashSTU

Browse files

Files changed (6) hide show

config.json +21 -9
config.py +9 -5
model.py +76 -66
pytorch_model-00001-of-00002.bin +3 -0
pytorch_model-00002-of-00002.bin +3 -0
pytorch_model.bin.index.json +296 -0

config.json CHANGED Viewed

@@ -1,18 +1,30 @@
 {
   "architectures": [
-    "Transformer"
   ],
   "bias": false,
-  "bsz": 8,
   "dropout": 0.0,
-  "mlp_scale": 4,
-  "model_type": "llama",
-  "n_embd": 768,
-  "n_heads": 12,
-  "n_layers": 12,
-  "seq_len": 4096,
   "softcap": 50.0,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.44.0",
-  "vocab_size": 200064
 }

 {
   "architectures": [
+    "FlashSTU"
   ],
+  "auto_map": {
+    "AutoConfig": "config.FlashSTUConfig",
+    "AutoModel": "model.FlashSTU"
+  },
   "bias": false,
+  "bsz": 1,
   "dropout": 0.0,
+  "hidden_act": "swish",
+  "hidden_size": 1536,
+  "intermediate_size": 18432,
+  "model_type": "FlashSTU",
+  "n_embd": 1536,
+  "n_heads": 8,
+  "n_layers": 26,
+  "num_eigh": 24,
+  "seq_len": 8192,
   "softcap": 50.0,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.44.0",
+  "use_approx": true,
+  "use_attn": true,
+  "use_flash_fft": true,
+  "use_hankel_L": false,
+  "vocab_size": 200064,
+  "window_size": 1024
 }

config.py CHANGED Viewed

@@ -8,10 +8,10 @@ class FlashSTUConfig(PretrainedConfig):
     def __init__(
         self,
-        bsz: int = 4,
-        n_embd: int = 2304,
-        n_heads: int = 9,
-        n_layers: int = 7,
         seq_len: int = 8192,
         window_size: int = 1024,
         vocab_size: int = 200064,
@@ -22,6 +22,7 @@ class FlashSTUConfig(PretrainedConfig):
         use_hankel_L: bool = False,
         use_flash_fft: bool = True,
         use_approx: bool = True,
         softcap: float = 50.0,
         torch_dtype: torch.dtype = torch.bfloat16,
         **kwargs,
@@ -34,12 +35,15 @@ class FlashSTUConfig(PretrainedConfig):
         self.seq_len = seq_len
         self.window_size = window_size
         self.vocab_size = vocab_size
-        self.mlp_scale = mlp_scale
         self.bias = bias
         self.dropout = dropout
         self.num_eigh = num_eigh
         self.use_hankel_L = use_hankel_L
         self.use_flash_fft = use_flash_fft
         self.use_approx = use_approx
         self.softcap = softcap
         self.torch_dtype = torch_dtype

     def __init__(
         self,
+        bsz: int = 1,
+        n_embd: int = 1536,
+        n_heads: int = 8,
+        n_layers: int = 26,
         seq_len: int = 8192,
         window_size: int = 1024,
         vocab_size: int = 200064,
         use_hankel_L: bool = False,
         use_flash_fft: bool = True,
         use_approx: bool = True,
+        use_attn: bool = True,
         softcap: float = 50.0,
         torch_dtype: torch.dtype = torch.bfloat16,
         **kwargs,
         self.seq_len = seq_len
         self.window_size = window_size
         self.vocab_size = vocab_size
+        self.hidden_size = n_embd
+        self.intermediate_size = n_embd * mlp_scale
+        self.hidden_act = "swish"
         self.bias = bias
         self.dropout = dropout
         self.num_eigh = num_eigh
         self.use_hankel_L = use_hankel_L
         self.use_flash_fft = use_flash_fft
         self.use_approx = use_approx
+        self.use_attn = use_attn
         self.softcap = softcap
         self.torch_dtype = torch_dtype

model.py CHANGED Viewed

@@ -1,16 +1,15 @@
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from transformers import PreTrainedModel
 from stu import STU
-from modules import Attention
-from utils import get_spectral_filters, nearest_power_of_two
 from flash_stu.config import FlashSTUConfig
 try:
-    from flash_attn.modules.mlp import GatedMlp as MLP
     triton_mlp = True
 except ImportError as e:
     print(f"Unable to import Triton-based MLP: {e}. Falling back to vanilla SwiGLU MLP instead.")
@@ -18,95 +17,99 @@ except ImportError as e:
     triton_mlp = False
 try:
-    from flash_attn.ops.triton.layer_norm import RMSNorm
 except ImportError as e:
     print(f"Unable to import Triton-based RMSNorm: {e}. Falling back to PyTorch implementation.")
     from torch.nn import RMSNorm
-try:
-    from flash_attn.losses.cross_entropy import CrossEntropyLoss
-except ImportError as e:
-    print(f"Unable to import Triton-based cross entropy loss: {e}. Falling back to PyTorch implementation.")
-    from torch.nn import CrossEntropyLoss
-class Block(nn.Module):
-    def __init__(self, config, phi, n) -> None:
-        super(Block, self).__init__()
-        # For more complex %-split arrangements, see https://arxiv.org/pdf/2406.07887
-        self.rn_1 = RMSNorm(config.n_embd, dtype=config.torch_dtype)
         self.stu = STU(config, phi, n)
-        self.rn_2 = RMSNorm(config.n_embd, dtype=config.torch_dtype)
         self.attn = Attention(config)
-        self.rn_3 = RMSNorm(config.n_embd, dtype=config.torch_dtype)
-        self.mlp = MLP(
-            config.n_embd,
-            config.n_embd * config.mlp_scale,
-            activation=F.silu, # Use SwiGLU
-            bias1=config.bias,
-            bias2=config.bias,
-            dtype=config.torch_dtype,
-        ) if triton_mlp else MLP(config, dtype=config.torch_dtype)
-        self.rn_4 = RMSNorm(config.n_embd, dtype=config.torch_dtype)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = x + self.stu(self.rn_1(x))
-        x = x + self.mlp(self.rn_2(x))
-        x = x + self.attn(self.rn_3(x))
-        x = x + self.mlp(self.rn_4(x))
         return x
 class FlashSTU(PreTrainedModel):
     config_class = FlashSTUConfig
-    def __init__(self, config) -> None:
         super(FlashSTU, self).__init__(config)
-        self.config = config
         self.n_layers = config.n_layers
-        self.n_embd = config.n_embd
-        self.mlp_scale = config.mlp_scale
-        self.seq_len = config.seq_len
-        self.n = nearest_power_of_two(self.seq_len * 2 - 1, round_up=True)
-        self.vocab_size = config.vocab_size
-        self.K = config.num_eigh
-        self.use_hankel_L = config.use_hankel_L
-        self.phi = get_spectral_filters(self.seq_len, self.K, self.use_hankel_L)
         self.use_approx = config.use_approx
-        self.dropout = config.dropout
-        self.bias = config.bias
-        self.loss_fn = CrossEntropyLoss()
-        self.flash_stu = nn.ModuleDict(
-            dict(
-                tok_emb=nn.Embedding(self.vocab_size, self.n_embd, dtype=config.torch_dtype),
-                dropout=nn.Dropout(self.dropout),
-                hidden=nn.ModuleList(
-                    [
-                        Block(self.config, self.phi, self.n)
-                        for _ in range(self.n_layers)
-                    ]
-                ),
-                rn_f=RMSNorm(config.n_embd, dtype=config.torch_dtype)
-            )
-        )
-        self.lm_head = nn.Linear(self.n_embd, self.vocab_size, bias=self.bias, dtype=config.torch_dtype)
-        self.std = (self.n_embd) ** -0.5
         self.apply(self._init_weights)
         print("Model Parameter Count: %.2fM\n" % (self._get_num_params() / 1e6,))
     def forward(self, x: torch.Tensor) -> torch.tensor:
-        tok_emb = self.flash_stu.tok_emb(x)
-        x = self.flash_stu.dropout(tok_emb)
-        for block in self.flash_stu.hidden:
-            x = block(x)
-        x = self.flash_stu.rn_f(x)
         y_hat = self.lm_head(x)
         return y_hat
     def _get_num_params(self):
         n_params = sum(p.numel() for p in self.parameters())
         return n_params
     def _init_weights(self, module):
@@ -125,3 +128,10 @@ class FlashSTU(PreTrainedModel):
             else:
                 torch.nn.init.xavier_normal_(module.M_phi_plus)
                 torch.nn.init.xavier_normal_(module.M_phi_minus)

 import torch
 import torch.nn as nn
 from transformers import PreTrainedModel
 from stu import STU
+from modules_stu import Attention
+from utils import nearest_power_of_two
 from flash_stu.config import FlashSTUConfig
 try:
+    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP as TritonMLP
     triton_mlp = True
 except ImportError as e:
     print(f"Unable to import Triton-based MLP: {e}. Falling back to vanilla SwiGLU MLP instead.")
     triton_mlp = False
 try:
+    from liger_kernel.transformers.rms_norm import LigerRMSNorm as TritonNorm
+    triton_norm = True
 except ImportError as e:
     print(f"Unable to import Triton-based RMSNorm: {e}. Falling back to PyTorch implementation.")
     from torch.nn import RMSNorm
+    triton_norm = False
+class STULayer(nn.Module):
     # Residual block that pairs an STU sub-layer with an MLP sub-layer. Each
     # sub-layer reads a normalized copy of x and its output is added back to x.
+    def __init__(self, config, phi, n):
         # config supplies n_embd and torch_dtype here; phi and n are handed to
         # STU unchanged (their meaning is defined by STU, which is not visible here).
+        super(STULayer, self).__init__()
         # TritonNorm (Liger RMSNorm) when triton_norm is set, else torch.nn.RMSNorm.
+        self.stu_norm = TritonNorm(config.n_embd) if triton_norm else RMSNorm(config.n_embd, dtype=config.torch_dtype)
         self.stu = STU(config, phi, n)
+        self.mlp_norm = TritonNorm(config.n_embd) if triton_norm else RMSNorm(config.n_embd, dtype=config.torch_dtype)
         # NOTE(review): the fallback name `MLP` is not imported in the visible
         # lines of this diff; presumably a line outside the hunk provides it - confirm.
+        self.mlp = TritonMLP(config) if triton_mlp else MLP(config, dtype=config.torch_dtype)
+        # TODO: Write Issue in Liger-Kernel repo to support user-defined dtype for MLP
         # The Liger constructors above receive no dtype, so the modules are cast
         # afterwards; for the PyTorch fallbacks this repeats the dtype already passed.
+        self.stu_norm = self.stu_norm.to(dtype=config.torch_dtype)
+        self.mlp = self.mlp.to(dtype=config.torch_dtype)
+        self.mlp_norm = self.mlp_norm.to(dtype=config.torch_dtype)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
         # STU residual first, then MLP residual; returns the updated x.
+        x = x + self.stu(self.stu_norm(x))
+        x = x + self.mlp(self.mlp_norm(x))
+        return x
+class AttentionLayer(nn.Module):
     # Residual block that pairs an Attention sub-layer with an MLP sub-layer.
     # Each sub-layer reads a normalized copy of x and its output is added back to x.
+    def __init__(self, config) -> None:
         # config supplies n_embd and torch_dtype here and is passed whole to
         # Attention and to the MLP constructor.
+        super(AttentionLayer, self).__init__()
         # TritonNorm (Liger RMSNorm) when triton_norm is set, else torch.nn.RMSNorm.
+        self.attn_norm = TritonNorm(config.n_embd) if triton_norm else RMSNorm(config.n_embd, dtype=config.torch_dtype)
         self.attn = Attention(config)
+        self.mlp_norm = TritonNorm(config.n_embd) if triton_norm else RMSNorm(config.n_embd, dtype=config.torch_dtype)
         # NOTE(review): the fallback name `MLP` is not imported in the visible
         # lines of this diff; presumably a line outside the hunk provides it - confirm.
+        self.mlp = TritonMLP(config) if triton_mlp else MLP(config, dtype=config.torch_dtype)
+        # TODO: Write Issue in Liger-Kernel repo to support user-defined dtype for MLP
         # The Liger constructors above receive no dtype, so the modules are cast
         # afterwards; for the PyTorch fallbacks this repeats the dtype already passed.
+        self.attn_norm = self.attn_norm.to(dtype=config.torch_dtype)
+        self.mlp = self.mlp.to(dtype=config.torch_dtype)
+        self.mlp_norm = self.mlp_norm.to(dtype=config.torch_dtype)
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         # Attention residual first, then MLP residual; returns the updated x.
+        x = x + self.attn(self.attn_norm(x))
+        x = x + self.mlp(self.mlp_norm(x))
         return x
 class FlashSTU(PreTrainedModel):
     config_class = FlashSTUConfig
+    def __init__(self, config, phi) -> None:
         # Builds the embedding, the stack of STU/attention layers, the final norm
         # and the output projection from config. phi is supplied by the caller and
         # forwarded unchanged to every STULayer.
         super(FlashSTU, self).__init__(config)
         self.n_layers = config.n_layers
         # Power of two at or above 2 * seq_len - 1, passed to each STULayer as n;
         # presumably the FFT length used inside STU - confirm against STU.
+        self.n = nearest_power_of_two(config.seq_len * 2 - 1, round_up=True)
+        self.phi = phi
         self.use_approx = config.use_approx
+        # TODO: Add support for Liger-Kernel Embedding once no longer experimental
+        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd, dtype=config.torch_dtype)
+        self.dropout = nn.Dropout(config.dropout)
+        self.layers = nn.ModuleList()
         # Even layer indices are always STU layers. Odd indices are attention
         # layers when config.use_attn is set, otherwise STU layers as well.
+        for layer_idx in range(self.n_layers):
+            # For more complex %-split arrangements, see https://arxiv.org/pdf/2406.07887
+            if layer_idx % 2 == 0:
+                self.layers.append(STULayer(config, self.phi, self.n))
+            else:
+                self.layers.append(AttentionLayer(config) if config.use_attn else STULayer(config, self.phi, self.n))
+        self.norm = TritonNorm(config.n_embd) if triton_norm else RMSNorm(config.n_embd, dtype=config.torch_dtype)
+        # TODO: Write Issue in Liger-Kernel repo to support user-defined dtype for RMS Norm
+        self.norm = self.norm.to(dtype=config.torch_dtype)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=config.bias, dtype=config.torch_dtype)
         # Weight tying: the embedding now references the same Parameter object
         # as the output projection.
+        self.tok_emb.weight = self.lm_head.weight
         # n_embd ** -0.5; presumably the init scale read by _init_weights, whose
         # relevant lines are not visible in this diff - confirm.
+        self.std = (config.n_embd) ** -0.5
         # Runs _init_weights on every submodule, then reports the count from
         # _get_num_params in millions.
         self.apply(self._init_weights)
         print("Model Parameter Count: %.2fM\n" % (self._get_num_params() / 1e6,))
     def forward(self, x: torch.Tensor) -> torch.tensor:
         # Pipeline: embedding lookup -> dropout -> every layer in order -> final
         # norm -> lm_head. Returns the lm_head output, whose last dimension is
         # config.vocab_size (lm_head is Linear(n_embd, vocab_size)).
         # assumes x holds integer token ids suitable for nn.Embedding - TODO confirm
         # NOTE(review): the return annotation `torch.tensor` is the tensor factory
         # function; the class `torch.Tensor` was likely intended.
+        tok_emb = self.tok_emb(x)
+        x = self.dropout(tok_emb)
+        for layer in self.layers:
+            x = layer(x)
+        x = self.norm(x)
         y_hat = self.lm_head(x)
         return y_hat
     def _get_num_params(self):
         # Returns the element count summed over self.parameters(), minus the
         # embedding tables handled by the two checks below.
         n_params = sum(p.numel() for p in self.parameters())
         # NOTE(review): no pos_emb attribute is assigned in the visible __init__,
         # so this branch looks inactive here - confirm.
+        if hasattr(self, "pos_emb") and self.pos_emb is not None:
+            n_params -= self.pos_emb.weight.numel()
         # The token embedding is subtracted only when it is NOT the same object
         # as lm_head.weight; __init__ ties the two, so this fires only if that
         # tie has been broken.
+        if self.tok_emb.weight is not self.lm_head.weight:
+            n_params -= self.tok_emb.weight.numel()
         return n_params
     def _init_weights(self, module):
             else:
                 torch.nn.init.xavier_normal_(module.M_phi_plus)
                 torch.nn.init.xavier_normal_(module.M_phi_minus)
+        elif isinstance(module, Attention):
+            torch.nn.init.xavier_normal_(module.c_attn.weight)
+            torch.nn.init.xavier_normal_(module.c_proj.weight)
+            if module.c_attn.bias is not None:
+                torch.nn.init.zeros_(module.c_attn.bias)
+            if module.c_proj.bias is not None:
+                torch.nn.init.zeros_(module.c_proj.bias)

pytorch_model-00001-of-00002.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa79518498d1be3c786f3b9c85e1254573b0202d4510491213509f8c9dd6e466
+size 4982442909

pytorch_model-00002-of-00002.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:995329e769de961be2233233b8544984c2de522c6b2112643d73cabb4503bb69
+size 358626608

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,296 @@

+{
+  "metadata": {
+    "total_size": 5340972032
+  },
+  "weight_map": {
+    "layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.0.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.0.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.0.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.0.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.0.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.0.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.0.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.0.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.0.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.0.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.0.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.0.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.1.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.1.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.1.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.1.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.10.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.10.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.10.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.10.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.10.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.10.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.10.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.10.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.10.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.10.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.10.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.10.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.11.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.11.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.11.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.11.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.12.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.12.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.12.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.12.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.12.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.12.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.12.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.12.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.12.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.12.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.12.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.12.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.13.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.13.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.13.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.13.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.14.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.14.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.14.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.14.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.14.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.14.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.14.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.14.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.14.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.14.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.14.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.14.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.15.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.15.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.15.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.15.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.16.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.16.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.16.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.16.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.16.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.16.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.16.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.16.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.16.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.16.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.16.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.16.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.17.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.17.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.17.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.17.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.18.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.18.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.18.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.18.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.18.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.18.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.18.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.18.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.18.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.18.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.18.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.18.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.19.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.19.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.19.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.19.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.2.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.2.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.2.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.2.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.2.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.2.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.2.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.2.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.2.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.2.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.2.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.2.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.20.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.20.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.20.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.20.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.20.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.20.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.20.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.20.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.20.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.20.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.20.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.20.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.21.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.21.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.21.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.21.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.22.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.22.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.22.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.22.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.22.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.22.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.22.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.22.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.22.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.22.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.22.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.22.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.23.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.23.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.23.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.23.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.23.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.23.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.23.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "layers.24.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.24.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.24.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.24.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.24.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.24.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.24.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.24.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.24.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.24.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.24.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.24.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.25.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
+    "layers.25.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "layers.25.attn_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "layers.25.mlp_norm.weight": "pytorch_model-00002-of-00002.bin",
+    "layers.3.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.3.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.3.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.3.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.4.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.4.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.4.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.4.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.4.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.4.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.4.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.4.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.4.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.4.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.4.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.4.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.5.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.5.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.5.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.5.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.6.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.6.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.6.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.6.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.6.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.6.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.6.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.6.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.6.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.6.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.6.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.6.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.7.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.7.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.7.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.7.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.8.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.8.stu.M_filters": "pytorch_model-00001-of-00002.bin",
+    "layers.8.stu.M_inputs": "pytorch_model-00001-of-00002.bin",
+    "layers.8.stu.flash_fft.f_16_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.8.stu.flash_fft.f_16_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.8.stu.flash_fft.f_32_fft": "pytorch_model-00001-of-00002.bin",
+    "layers.8.stu.flash_fft.f_32_ifft": "pytorch_model-00001-of-00002.bin",
+    "layers.8.stu.flash_fft.twiddle_factors_fft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.8.stu.flash_fft.twiddle_factors_fft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.8.stu.flash_fft.twiddle_factors_ifft_16_1K": "pytorch_model-00001-of-00002.bin",
+    "layers.8.stu.flash_fft.twiddle_factors_ifft_32_32": "pytorch_model-00001-of-00002.bin",
+    "layers.8.stu_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.9.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.9.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.9.attn_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "layers.9.mlp_norm.weight": "pytorch_model-00001-of-00002.bin",
+    "lm_head.weight": "pytorch_model-00001-of-00002.bin",
+    "norm.weight": "pytorch_model-00002-of-00002.bin",
+    "tok_emb.weight": "pytorch_model-00001-of-00002.bin"
+  }
+}