yagizdevre committed
Commit a2fbb2f · 1 Parent(s): 44d4c6b

transformer new

Files changed (7)
  1. attn.py +66 -0
  2. attn_masks.py +188 -0
  3. attn_mods.py +127 -0
  4. config.json +7 -11
  5. configuration_minitransformer.py +18 -13
  6. layers.py +11 -72
  7. modeling_minitransformer.py +30 -42
attn.py CHANGED
@@ -123,3 +123,69 @@ class AttentionSDPA(nn.Module):
 
         y = self.resid_dropout(self.o_proj(y))
         return y
+
+
+class FlexAttention(nn.Module):
+    """
+    Generalized multi-head attention that supports various attention masks
+    and rotary positional embeddings.
+    """
+    def __init__(self, config, mask_mod, score_mod=None):
+        """
+        Initializes the FlexAttention module.
+
+        Args:
+            config: Model config providing dim, num_heads, seq_len, and device.
+            mask_mod (Callable): Mask to modify attention scores, e.g. causal.
+            score_mod (Callable, optional): Score modifier, e.g. tanh softcapping.
+        """
+        super().__init__()
+        self.dim, self.num_heads = config.dim, config.num_heads
+        assert config.dim % config.num_heads == 0, f"dim ({self.dim}) must be divisible by num_heads ({self.num_heads})"
+        self.head_dim = config.dim // config.num_heads
+
+        self.wq = nn.Linear(config.dim, config.dim)
+        self.wk = nn.Linear(config.dim, config.dim)
+        self.wv = nn.Linear(config.dim, config.dim)
+
+        self.mask_mod = mask_mod
+        self.score_mod = score_mod
+        self.block_mask = create_block_mask(
+            mask_mod=self.mask_mod,
+            B=None,  # Broadcast over batch
+            H=None,  # Broadcast over heads
+            Q_LEN=config.seq_len,
+            KV_LEN=config.seq_len,
+            device=config.device,
+        )
+
+        self.o_proj = nn.Linear(config.dim, config.dim)
+        self.o_proj.SCALE_INIT = 1
+
+    def forward(
+        self,
+        x: torch.Tensor = None,
+        q: torch.Tensor = None,
+        k: torch.Tensor = None,
+        v: torch.Tensor = None,
+        freqs_cis: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if x is not None:
+            q = k = v = x
+        if any(t is None for t in [q, k, v]):
+            raise ValueError("Must provide either x for self-attention or q/k/v for cross-attention.")
+
+        bsz, q_len, _ = q.shape
+        _, k_len, _ = k.shape
+        _, v_len, _ = v.shape
+
+        Q = self.wq(q).reshape(bsz, self.num_heads, q_len, self.head_dim)
+        K = self.wk(k).reshape(bsz, self.num_heads, k_len, self.head_dim)
+        V = self.wv(v).reshape(bsz, self.num_heads, v_len, self.head_dim)
+
+        Q, K = apply_rotary_emb(Q, K, freqs_cis=freqs_cis)
+
+        output = flex_attention(Q, K, V, block_mask=self.block_mask, score_mod=self.score_mod)
+        output = output.reshape(bsz, q_len, self.dim)
+        output = self.o_proj(output)
+        return output
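
Usage sketch (editor's note, not part of the commit): FlexAttention above wraps PyTorch's flex_attention API, precomputing a BlockMask once and reusing it on every forward pass. A minimal, self-contained illustration of that underlying API, assuming PyTorch 2.5+ and a CUDA device:

import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

def causal(b, h, q_idx, kv_idx):
    # True where query position q_idx may attend to key position kv_idx
    return q_idx >= kv_idx

B, H, S, D = 1, 8, 256, 64
q, k, v = (torch.randn(B, H, S, D, device="cuda") for _ in range(3))

# Build the block-sparse mask once; B=None and H=None broadcast over batch and heads.
block_mask = create_block_mask(causal, B=None, H=None, Q_LEN=S, KV_LEN=S, device="cuda")

out = flex_attention(q, k, v, block_mask=block_mask)  # shape (B, H, S, D)
print(out.shape)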
attn_masks.py ADDED
@@ -0,0 +1,188 @@
+import torch
+from torch.nn.attention.flex_attention import _mask_mod_signature
+
+
+def causal_mask(
+    batch_size: int,
+    num_heads: int,
+    q_idx: torch.Tensor,
+    kv_idx: torch.Tensor
+) -> torch.Tensor:
+    """
+    Returns a boolean tensor indicating which positions in the attention matrix
+    are valid for causal (autoregressive) attention. By default, it's True for
+    positions (i, j) where i >= j.
+
+    Args:
+        batch_size (int): Batch size (unused here).
+        num_heads (int): Number of heads (unused here).
+        q_idx (torch.Tensor): Tensor indexing the query positions.
+        kv_idx (torch.Tensor): Tensor indexing the key/value positions.
+
+    Returns:
+        torch.Tensor: A boolean tensor where True indicates that the query at
+        position i can attend to the key at position j, respecting i >= j.
+    """
+    return q_idx >= kv_idx
+
+
+def generate_sliding_window_mask(window_size: int, causal: bool = True) -> _mask_mod_signature:
+    """
+    Creates a sliding window mask function.
+
+    If `causal=True`, each query token at position i can attend only to tokens j
+    in [i - window_size, i].
+    If `causal=False`, each query token i can attend to any token j in
+    [i - window_size, i + window_size], i.e. a symmetric window of size `window_size`.
+
+    Args:
+        window_size (int): The maximum distance from i that i can attend to.
+        causal (bool): Whether to enforce causal ordering (i >= j). Defaults to True.
+
+    Returns:
+        _mask_mod_signature: A callable mask function that takes
+        (batch_size, num_heads, q_idx, kv_idx) and returns a boolean tensor
+        indicating allowed attention connections.
+    """
+    def sliding_window_mask(
+        batch_size: int,
+        num_heads: int,
+        q_idx: torch.Tensor,
+        kv_idx: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        If causal is True:
+            within_window = (q_idx - kv_idx) <= window_size, and q_idx >= kv_idx.
+        If causal is False:
+            within_window = abs(q_idx - kv_idx) <= window_size.
+        """
+        if causal:
+            # standard "look back" window
+            distance = q_idx - kv_idx
+            within_window = (distance >= 0) & (distance <= window_size)
+        else:
+            # symmetric window around i
+            distance = (q_idx - kv_idx).abs()
+            within_window = distance <= window_size
+
+        return within_window
+
+    name_ext = "causal" if causal else "noncausal"
+    sliding_window_mask.__name__ = f"sliding_window_{window_size}_{name_ext}"
+    return sliding_window_mask
+
+
+def generate_dilated_sliding_window_mask(
+    window_size: int,
+    dilation: int = 2,
+    causal: bool = True
+) -> _mask_mod_signature:
+    """
+    Creates a dilated sliding window mask function.
+
+    If `causal=True`, each query token i can attend to tokens j in [i - window_size, i]
+    such that (i - j) % dilation == 0.
+    If `causal=False`, each query token i can attend to tokens j in [i - window_size,
+    i + window_size] for which |i - j| % dilation == 0.
+
+    Args:
+        window_size (int): The maximum distance from i to j (backwards if causal=True,
+            otherwise symmetric around i).
+        dilation (int): The stride for skipping positions.
+        causal (bool): Whether to enforce causal ordering (i >= j). Defaults to True.
+
+    Returns:
+        _mask_mod_signature: A callable mask function that takes
+        (batch_size, num_heads, q_idx, kv_idx) and returns a boolean tensor
+        indicating allowed attention connections.
+    """
+    def dilated_sliding_window_mask(
+        batch_size: int,
+        num_heads: int,
+        q_idx: torch.Tensor,
+        kv_idx: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        If causal is True:
+            distance = q_idx - kv_idx
+            0 <= distance <= window_size and distance % dilation == 0.
+        If causal is False:
+            distance = (q_idx - kv_idx).abs()
+            distance <= window_size and distance % dilation == 0.
+        """
+        if causal:
+            distance = q_idx - kv_idx
+            within_window = (distance >= 0) & (distance <= window_size)
+        else:
+            distance = (q_idx - kv_idx).abs()
+            within_window = distance <= window_size
+
+        meets_dilation = (distance % dilation) == 0
+        return within_window & meets_dilation
+
+    mode_str = "causal" if causal else "noncausal"
+    dilated_sliding_window_mask.__name__ = (
+        f"dilated_sliding_window_{window_size}_dilation_{dilation}_{mode_str}"
+    )
+    return dilated_sliding_window_mask
+
+
+def main():
+    """
+    Demonstrates usage of each mask by printing attention grids. We include a few
+    basic checks to ensure the masks behave as expected. We show both the causal
+    and non-causal versions for the sliding window and dilated masks.
+    """
+    B, H = 1, 1
+    Q_LEN, KV_LEN = 8, 8
+
+    # coordinate grids
+    q_idx = torch.arange(Q_LEN).unsqueeze(-1).expand(Q_LEN, KV_LEN)
+    kv_idx = torch.arange(KV_LEN).unsqueeze(0).expand(Q_LEN, KV_LEN)
+
+    print("= Causal Mask =")
+    c_mask = causal_mask(B, H, q_idx, kv_idx)
+    print(c_mask.int(), "\n")
+
+    print("= Sliding Window (window_size=2, causal=True) =")
+    sw_causal_fn = generate_sliding_window_mask(window_size=2, causal=True)
+    sw_causal = sw_causal_fn(B, H, q_idx, kv_idx)
+    print(sw_causal.int(), "\n")
+
+    print("= Sliding Window (window_size=2, causal=False) =")
+    sw_noncausal_fn = generate_sliding_window_mask(window_size=2, causal=False)
+    sw_noncausal = sw_noncausal_fn(B, H, q_idx, kv_idx)
+    print(sw_noncausal.int(), "\n")
+
+    print("= Dilated Sliding Window (window_size=4, dilation=2, causal=True) =")
+    ds_causal_fn = generate_dilated_sliding_window_mask(window_size=4, dilation=2, causal=True)
+    ds_causal = ds_causal_fn(B, H, q_idx, kv_idx)
+    print(ds_causal.int(), "\n")
+
+    print("= Dilated Sliding Window (window_size=4, dilation=2, causal=False) =")
+    ds_noncausal_fn = generate_dilated_sliding_window_mask(window_size=4, dilation=2, causal=False)
+    ds_noncausal = ds_noncausal_fn(B, H, q_idx, kv_idx)
+    print(ds_noncausal.int(), "\n")
+
+    # Quick checks:
+    # (1) Causal means no i < j
+    assert torch.all(c_mask == (q_idx >= kv_idx)), "Causal mask mismatch!"
+    # (2) For windowed masks with causal=True, check a random row
+    i = 5
+    row_sw = sw_causal[i]
+    allowed_js = torch.where(row_sw)[0]
+    if len(allowed_js) > 0:
+        # difference i - j <= 2
+        assert (i - allowed_js.min()) <= 2, "Window mismatch for sliding_window_mask(causal=True)."
+
+    # (3) Dilated mask with causal=True should skip every other position if dilation=2
+    i = 6
+    row_ds = ds_causal[i]
+    allowed_js = torch.where(row_ds)[0]
+    for j in allowed_js:
+        diff = i - j
+        assert diff % 2 == 0, f"Dilation mismatch: got diff={diff}."
+
+    print("All checks passed.")
+
+
+if __name__ == "__main__":
+    main()
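
Editor's note: a quick way to inspect any of these mask_mods outside of main() is to materialize them densely with create_mask from the same PyTorch module. A sketch, assuming a recent PyTorch release with the FlexAttention utilities and that attn_masks.py is importable from the working directory:

import torch
from torch.nn.attention.flex_attention import create_mask
from attn_masks import generate_dilated_sliding_window_mask

mask_fn = generate_dilated_sliding_window_mask(window_size=4, dilation=2, causal=True)
# create_mask evaluates the mask_mod over a dense (B, H, Q_LEN, KV_LEN) grid of indices.
dense = create_mask(mask_fn, B=1, H=1, Q_LEN=8, KV_LEN=8, device="cpu")
print(dense[0, 0].int())  # 1 where attention is allowed, 0 where it is masked out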
attn_mods.py ADDED
@@ -0,0 +1,127 @@
+import torch
+from torch import Tensor
+from torch.nn.attention.flex_attention import _score_mod_signature
+from torch._inductor.lowering import make_pointwise, register_lowering
+
+# Some internal torch.compile details
+from torch._inductor.virtualized import ops
+from functools import partial
+
+
+@torch.library.custom_op("approx::tanh", mutates_args=())
+def _tanh_approx(inp: Tensor) -> Tensor:
+    return torch.tanh(inp)
+
+
+@_tanh_approx.register_fake
+def _(inp: torch.Tensor) -> torch.Tensor:
+    return torch.tanh(inp)
+
+
+def _tanh_approx_lowering(inp):
+    fn = partial(ops.inline_asm_elementwise, asm="tanh.approx.f32 $0, $1;")
+    return make_pointwise(fn)(inp)
+
+
+register_lowering(torch.ops.approx.tanh)(_tanh_approx_lowering)
+
+
+class _TanhApprox(torch.autograd.Function):
+    @staticmethod
+    def forward(x):
+        return torch.ops.approx.tanh(x)
+
+    @staticmethod
+    def setup_context(ctx, inputs, output):
+        (x,) = inputs
+        result = output
+        ctx.save_for_backward(result)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        (result,) = ctx.saved_tensors
+        return grad_output * (1 - result * result)
+
+    @staticmethod
+    def vmap(info, in_dims, x):
+        return torch.tanh(x), 0
+
+
+_tanh_approx = _TanhApprox.apply
+
+
+def generate_tanh_softcap(soft_cap: int, approx: bool = False) -> _score_mod_signature:
+    """Returns a tanh softcapping score_mod.
+
+    Args:
+        soft_cap: The soft cap value used to normalize the logits.
+        approx: Whether to use the `tanh.approx` PTX instruction.
+
+    Returns:
+        tanh_softcap: score_mod
+    """
+    tanh = _tanh_approx if approx else torch.tanh
+
+    def tanh_softcap(score, b, h, q_idx, kv_idx):
+        return soft_cap * tanh(score / soft_cap)
+
+    prefix = "tanh_softcap_approx" if approx else "tanh_softcap"
+    tanh_softcap.__name__ = f"{prefix}_{soft_cap}"
+
+    return tanh_softcap
+
+
+def generate_alibi_bias(H: int) -> _score_mod_signature:
+    """Returns an ALiBi bias score_mod given the number of heads H.
+
+    Args:
+        H: number of heads
+
+    Returns:
+        alibi_bias: ALiBi bias score_mod
+    """
+
+    def alibi_mod(score, b, h, q_idx, kv_idx):
+        scale = torch.exp2(-((h + 1) * 8.0 / H))
+        bias = (kv_idx - q_idx) * scale
+        return score + bias
+
+    return alibi_mod
+
+
+def generate_tanh_softcap_alibi(H: int, soft_cap: float, approx: bool = False) -> _score_mod_signature:
+    """Returns a combined ALiBi and tanh softcapping score_mod.
+
+    Args:
+        H (int): number of heads for ALiBi scaling
+        soft_cap (float): the soft cap value for normalizing/clipping the logits
+        approx (bool): Whether to use the 'tanh.approx' PTX-based approximation
+
+    Returns:
+        A combined score_mod function that first applies ALiBi,
+        then performs softcap + tanh (optionally approximate).
+    """
+    tanh_func = _tanh_approx if approx else torch.tanh
+
+    def alibi_tanh_softcap(score, b, h, q_idx, kv_idx):
+        # Compute ALiBi bias
+        scale = torch.exp2(-((h + 1) * 8.0 / H))
+        bias = (kv_idx - q_idx) * scale
+        score = score + bias
+
+        # Apply softcap
+        score = score / soft_cap
+
+        # Apply tanh
+        score = tanh_func(score)
+
+        # Rescale by soft_cap
+        score = score * soft_cap
+        return score
+
+    # Give the score_mod a unique name:
+    if approx:
+        alibi_tanh_softcap.__name__ = f"tanh_softcap_alibi_approx_{soft_cap}"
+    else:
+        alibi_tanh_softcap.__name__ = f"tanh_softcap_alibi_{soft_cap}"
+
+    return alibi_tanh_softcap
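
Editor's note: a score_mod is just a callable (score, b, h, q_idx, kv_idx) -> score that flex_attention applies to every attention logit before the softmax. A minimal sketch of the softcapping idea wired directly into flex_attention, independent of this file's PTX-approximation helpers (assumes PyTorch 2.5+ and a CUDA device):

import torch
from torch.nn.attention.flex_attention import flex_attention

SOFT_CAP = 50.0  # matches the "softcap" value shipped in config.json

def tanh_softcap(score, b, h, q_idx, kv_idx):
    # Squash each logit into (-SOFT_CAP, SOFT_CAP) before the softmax.
    return SOFT_CAP * torch.tanh(score / SOFT_CAP)

B, H, S, D = 1, 8, 256, 64
q, k, v = (torch.randn(B, H, S, D, device="cuda") for _ in range(3))
out = flex_attention(q, k, v, score_mod=tanh_softcap)  # (B, H, S, D)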
config.json CHANGED
@@ -2,17 +2,15 @@
   "model_type": "minitransformer",
   "_name_or_path": "Transformer_500M",
   "architectures": ["MiniTransformer"],
-  "n_embd": 768,
-  "n_heads": 12,
-  "n_layers": 27,
+  "dim": 768,
+  "num_heads": 24,
+  "num_layers": 27,
   "seq_len": 8192,
   "window_size": 8192,
   "vocab_size": 200064,
   "mlp_scale": 4,
   "bias": false,
   "dropout": 0.0,
-  "num_eigh": 24,
-  "use_hankel_L": false,
   "num_epochs": 1,
   "global_bsz": 524288,
   "bsz": 1,
@@ -27,7 +25,7 @@
   "ddp": true,
   "mixed_precision": true,
   "torch_dtype": "bfloat16",
-  "use_cpu_offload": false,
+  "cpu_offload": false,
   "sharding_strategy": "full_shard",
   "state_dict_type": "full",
   "auto_wrap_policy": "partial",
@@ -42,12 +40,10 @@
     "buffer": "bfloat16"
   },
   "fsdp_modules": [
-    "Attention"
+    "AttentionLayer"
   ],
   "use_activation_checkpointing": true,
-  "use_flash_fft": true,
-  "use_approx": true,
-  "use_attn": true,
   "softcap": 50.0,
-  "torch_compile": false
+  "theta": 10000.0,
+  "torch_compile": true
 }
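
Editor's note: the renamed fields keep the divisibility constraint the model asserts at construction time (dim % num_heads == 0). A small sanity check over the updated file (a sketch; the path is assumed to be the repo's config.json):

import json

with open("config.json") as f:
    cfg = json.load(f)

assert cfg["dim"] % cfg["num_heads"] == 0, "dim must be divisible by num_heads"
head_dim = cfg["dim"] // cfg["num_heads"]
print(cfg["num_layers"], head_dim)  # 27 layers, head_dim = 768 // 24 = 32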
configuration_minitransformer.py CHANGED
@@ -7,33 +7,38 @@ class MiniTransformerConfig(PretrainedConfig):
     def __init__(
         self,
         bsz: int = 1,
-        n_embd: int = 768,
-        n_heads: int = 12,
-        n_layers: int = 27,
+        dim: int = 1536,
+        num_heads: int = 8,
+        num_layers: int = 26,
         seq_len: int = 8192,
-        window_size: int = 8192,
+        window_size: int = 1024,
         vocab_size: int = 200064,
-        mlp_scale: int = 4,
+        mlp_scale: int = 12,
         bias: bool = False,
         dropout: float = 0.0,
         softcap: float = 50.0,
-        torch_dtype = torch.bfloat16,
-        device: str = None,
+        theta: float = 10_000.0,
+        use_alibi: bool = False,
+        torch_dtype: torch.dtype = torch.bfloat16,
+        device: torch.device = None,
         **kwargs,
     ):
         super().__init__(**kwargs)
         self.bsz = bsz
-        self.n_embd = n_embd
-        self.n_heads = n_heads
-        self.n_layers = n_layers
+        self.dim = dim
+        self.num_heads = num_heads
+        self.num_layers = num_layers
         self.seq_len = seq_len
         self.window_size = window_size
         self.vocab_size = vocab_size
-        self.hidden_size = n_embd
-        self.intermediate_size = n_embd * mlp_scale
-        self.hidden_act = "swish"
+        self.hidden_size = dim
+        self.mlp_scale = mlp_scale
+        self.intermediate_size = self.dim * self.mlp_scale
         self.bias = bias
         self.dropout = dropout
         self.softcap = softcap
+        self.theta = theta
+        self.use_alibi = use_alibi
         self.torch_dtype = torch_dtype
         self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')  # Store as string
+
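
Editor's note: derived fields such as hidden_size and intermediate_size follow from dim and mlp_scale, so per-instance overrides propagate automatically. A sketch of instantiating the config with the values used in config.json (assuming torch and transformers are installed and the module is importable from the working directory):

from configuration_minitransformer import MiniTransformerConfig

config = MiniTransformerConfig(dim=768, num_heads=24, num_layers=27, mlp_scale=4)
print(config.hidden_size)        # 768, aliases dim
print(config.intermediate_size)  # 3072 = dim * mlp_scale
print(config.device)             # "cuda" if available, else "cpu"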
layers.py CHANGED
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 
-from .modules import STU
+from .attn import FlexAttention
 from .modules import MLP
 from .modules import Attention
 try:
@@ -23,80 +23,19 @@ except ImportError as e:
     from torch.nn import RMSNorm
     triton_norm = False
 
-
-class STULayer(nn.Module):
-    def __init__(self, config, phi, n):
-        super(STULayer, self).__init__()
-        if isinstance(config.torch_dtype, str):
-            torch_dtype = getattr(torch, config.torch_dtype)
-        else:
-            torch_dtype = config.torch_dtype
-        self.stu_norm = (
-            TritonNorm(config.n_embd)
-            if triton_norm
-            else RMSNorm(config.n_embd, dtype=torch_dtype)
-        )
-        self.stu = STU(config, phi, n)
-        self.stu = self.stu.to(dtype=torch_dtype)
-        self.mlp_norm = (
-            TritonNorm(config.n_embd)
-            if triton_norm
-            else RMSNorm(config.n_embd, dtype=torch_dtype)
-        )
-        self.mlp = (
-            TritonMLP(config) if triton_mlp else MLP(config, dtype=torch_dtype)
-        )
-
-        # TODO: Write Issue in Liger-Kernel repo to support user-defined dtype for MLP
-        self.stu_norm = self.stu_norm.to(dtype=torch_dtype)
-        self.mlp = self.mlp.to(dtype=torch_dtype)
-        self.mlp_norm = self.mlp_norm.to(dtype=torch_dtype)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # Debug dtype
-
-        # Normalize and apply STU
-        x_normed = self.stu_norm(x).to(dtype=self.stu.M_inputs.dtype)  # Match dtype for STU
-        x_stu = self.stu(x_normed).to(dtype=x.dtype)  # Ensure output matches `x`'s dtype
-        x = x + x_stu
-
-        # Normalize and apply MLP
-        x_normed_mlp = self.mlp_norm(x).to(dtype=self.mlp.gate_proj.weight.dtype)  # Match dtype for MLP
-        x_mlp = self.mlp(x_normed_mlp).to(dtype=x.dtype)  # Ensure output matches `x`'s dtype
-        x = x + x_mlp
-
-        return x
-
 class AttentionLayer(nn.Module):
-    def __init__(self, config) -> None:
+    def __init__(self, config, mask_mod, score_mod=None) -> None:
         super(AttentionLayer, self).__init__()
-        if isinstance(config.torch_dtype, str):
-            torch_dtype = getattr(torch, config.torch_dtype)
-        else:
-            torch_dtype = config.torch_dtype
-        self.attn_norm = (
-            TritonNorm(config.n_embd)
-            if triton_norm
-            else RMSNorm(config.n_embd, dtype=torch_dtype)
-        )
-        self.attn = Attention(config)
-        self.attn = self.attn.to(dtype=torch_dtype)
-        self.mlp_norm = (
-            TritonNorm(config.n_embd)
-            if triton_norm
-            else RMSNorm(config.n_embd, dtype=torch_dtype)
+        self.attn_norm = nn.RMSNorm(config.dim)
+        self.attn = FlexAttention(
+            config=config,
+            mask_mod=mask_mod,
+            score_mod=score_mod,
         )
-        self.mlp = (
-            TritonMLP(config) if triton_mlp else MLP(config, dtype=torch_dtype)
-        )
-        self.mlp = self.mlp.to(dtype=torch_dtype)
-
-        # TODO: Write Issue in Liger-Kernel repo to support user-defined dtype for MLP
-        self.attn_norm = self.attn_norm.to(dtype=torch_dtype)
-        self.mlp = self.mlp.to(dtype=torch_dtype)
-        self.mlp_norm = self.mlp_norm.to(dtype=torch_dtype)
+        self.mlp_norm = nn.RMSNorm(config.dim)
+        self.mlp = MLP(config)
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x = x + self.attn(self.attn_norm(x))
+    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor = None) -> torch.Tensor:
+        x = x + self.attn(self.attn_norm(x), freqs_cis=freqs_cis)
         x = x + self.mlp(self.mlp_norm(x))
         return x
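
Editor's note: the rewritten AttentionLayer is the standard pre-norm residual block (normalize, transform, add back) built on plain nn.RMSNorm. A self-contained sketch of that pattern, assuming PyTorch 2.4+ for nn.RMSNorm; nn.MultiheadAttention and a small feed-forward stack stand in for the repo's FlexAttention and MLP:

import torch
import torch.nn as nn

class PreNormBlock(nn.Module):
    def __init__(self, dim: int, num_heads: int = 8):
        super().__init__()
        self.attn_norm = nn.RMSNorm(dim)
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.mlp_norm = nn.RMSNorm(dim)
        self.mlp = nn.Sequential(nn.Linear(dim, 4 * dim), nn.SiLU(), nn.Linear(4 * dim, dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.attn_norm(x)
        x = x + self.attn(h, h, h, need_weights=False)[0]  # residual around attention
        x = x + self.mlp(self.mlp_norm(x))                 # residual around the MLP
        return x

x = torch.randn(2, 16, 512)
print(PreNormBlock(512)(x).shape)  # torch.Size([2, 16, 512])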
modeling_minitransformer.py CHANGED
@@ -10,6 +10,10 @@ from .utils import nearest_power_of_two
 from .layers import AttentionLayer
 from .configuration_minitransformer import MiniTransformerConfig
 
+from .attn_masks import causal_mask
+from .attn_mods import generate_tanh_softcap
+from .rotary_emb import precompute_freqs_cis
+
 try:
     from liger_kernel.transformers.rms_norm import LigerRMSNorm as TritonNorm
     triton_norm = True
@@ -33,39 +37,31 @@ class MiniTransformer(PreTrainedModel):
 
     def __init__(self, config) -> None:
         super(MiniTransformer, self).__init__(config)
-        self.n_layers = config.n_layers
-        self.n = nearest_power_of_two(config.seq_len * 2 - 1, round_up=True)
-
-        if isinstance(config.torch_dtype, torch.dtype):
-            torch_dtype = config.torch_dtype
-        else:
-            torch_dtype = getattr(torch, config.torch_dtype)
-
-        device = torch.device(config.device)
-
-        # TODO: Add support for Liger-Kernel Embedding once no longer experimental
-        self.tok_emb = nn.Embedding(
-            config.vocab_size, config.n_embd, dtype=config.torch_dtype
-        )
+        self.num_layers = config.num_layers
+        assert config.dim % config.num_heads == 0, f"dim ({config.dim}) must be divisible by num_heads ({config.num_heads})"
+        self.head_dim = config.dim // config.num_heads
+        logit_softcap = generate_tanh_softcap(soft_cap=config.softcap)
+
+        # From pytorch/pytorch#123411, we set persistent=True for torch.compile and PP compatibility
+        self.register_buffer("freqs_cis", precompute_freqs_cis(
+            head_dim=self.head_dim,
+            max_seq_len=config.seq_len,
+            theta=config.theta,
+        ), persistent=True)
+
+        self.tok_emb = nn.Embedding(config.vocab_size, config.dim)
         self.dropout = nn.Dropout(config.dropout)
 
         self.layers = nn.ModuleList()
-        for _ in range(self.n_layers):
-            self.layers.append(AttentionLayer(config))
+        for _ in range(self.num_layers):
+            layer = AttentionLayer(config, mask_mod=causal_mask, score_mod=logit_softcap)
+            self.layers.append(layer)
 
-        self.norm = (
-            TritonNorm(config.n_embd)
-            if triton_norm
-            else RMSNorm(config.n_embd, dtype=config.torch_dtype)
-        )
-        # TODO: Write Issue in Liger-Kernel repo to support user-defined dtype for RMS Norm
-        self.norm = self.norm.to(dtype=config.torch_dtype)
-        self.lm_head = nn.Linear(
-            config.n_embd, config.vocab_size, bias=config.bias, dtype=config.torch_dtype
-        )
-        self.tok_emb.weight = self.lm_head.weight
+        self.norm = nn.RMSNorm(config.dim)
+        self.lm_head = nn.Linear(config.dim, config.vocab_size, bias=config.bias)
+        # self.tok_emb.weight = self.lm_head.weight
 
-        self.std = (config.n_embd) ** -0.5
+        self.std = (config.dim) ** -0.5
         self.apply(self._init_weights)
         print("Model Parameter Count: %.2fM\n" % (self._get_num_params() / 1e6,))
 
@@ -77,15 +73,13 @@
     ) -> CausalLMOutput:
         # Compute embeddings
         tok_emb = self.tok_emb(input_ids)
-        x = self.dropout(tok_emb)
 
-        # Pass through layers
         for layer in self.layers:
-            x = layer(x)
+            tok_emb = layer(tok_emb, self.freqs_cis)
 
         # Normalize and project to vocabulary
-        x = self.norm(x)
-        logits = self.lm_head(x)
+        tok_emb = self.norm(tok_emb)
+        logits = self.lm_head(tok_emb)
 
         loss = None
         if labels is not None:
@@ -107,26 +101,20 @@
         n_params = sum(p.numel() for p in self.parameters())
         if hasattr(self, "pos_emb") and self.pos_emb is not None:
            n_params -= self.pos_emb.weight.numel()
-        if self.tok_emb.weight is not self.lm_head.weight:
+        if self.tok_emb.weight is self.lm_head.weight:
            n_params -= self.tok_emb.weight.numel()
        return n_params
 
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            if hasattr(module, "SCALE_INIT"):
-                self.std *= (2 * self.n_layers) ** -0.5
+                self.std *= (2 * self.num_layers) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=self.std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=self.std)
-        elif isinstance(module, Attention):
-            torch.nn.init.xavier_normal_(module.attn.weight)
-            torch.nn.init.xavier_normal_(module.o_proj.weight)
-            if module.attn.bias is not None:
-                torch.nn.init.zeros_(module.attn.bias)
-            if module.o_proj.bias is not None:
-                torch.nn.init.zeros_(module.o_proj.bias)
+
    @staticmethod
    def top_k_top_p_filtering(
        logits: torch.Tensor,
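
Editor's note: the init path keeps the GPT-2-style depth-scaled initialization, with projections marked SCALE_INIT drawn from a smaller standard deviation. The numbers for the shipped config (dim=768, 27 layers), following the same formula as _init_weights:

dim, num_layers = 768, 27
std = dim ** -0.5                        # base init std ≈ 0.0361
scaled = std * (2 * num_layers) ** -0.5  # std for SCALE_INIT projections ≈ 0.0049
print(round(std, 4), round(scaled, 4))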