Blackroot committed
Commit 6aced58 · verified · 1 Parent(s): f2c8e64

Upload 18 files
epoch3.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:849af991539dcc0d5b278fb81e96e2e72d005b5d671b024e18573c30ea51f676
size 507810914
inference.py ADDED
@@ -0,0 +1,54 @@
import torch
from transformers import AutoTokenizer
from llama_modeling.front_end import LlamaForCausalLM
from llama_modeling.config import LlamaConfig
import json
import sys
from utils.trainutils import load_checkpoint

def generate_text(model, tokenizer, prompt, max_new_tokens=30):
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to("cuda")

    with torch.inference_mode():
        outputs = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            temperature=0.7
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def main():
    if len(sys.argv) != 2:
        print("Usage: python inference.py <path_to_model>")
        sys.exit(1)

    model_path = sys.argv[1]
    device = "cuda" if torch.cuda.is_available() else "cpu"

    with open("config.json") as f:
        config_dict = json.load(f)
    config = LlamaConfig(**{k: v for k, v in config_dict.items() if k in LlamaConfig.__dataclass_fields__})

    model = LlamaForCausalLM(config).to(device)

    load_checkpoint(model, model_path)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained("./SmolLM2-135M-Instruct")

    prompts = [
        "Once upon a time,",
        "The best way to learn programming is",
        "Here's a recipe for chocolate cake:"
    ]

    with torch.no_grad(), torch.autocast(device_type='cuda', dtype=None):
        for prompt in prompts:
            print(f"\nPrompt: {prompt}")
            output = generate_text(model, tokenizer, prompt)
            print(f"Generated: {output}")
            print("-" * 50)

if __name__ == "__main__":
    main()
llama_modeling/__init__.py ADDED
File without changes
llama_modeling/attention.py ADDED
@@ -0,0 +1,106 @@
from flash_attn import flash_attn_func
from typing import Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from .liger_rope import LigerRopeFunction
from .config import LlamaConfig

class LlamaAttention(nn.Module):
    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_attention_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_attention_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.register_buffer(
            "cos_cached",
            self._compute_rope_embeddings(
                self.max_position_embeddings,
                self.head_dim,
                self.rope_theta,
                dtype=torch.float32,
                device=self.q_proj.weight.device,
            )[0],
            persistent=False,
        )
        self.register_buffer(
            "sin_cached",
            self._compute_rope_embeddings(
                self.max_position_embeddings,
                self.head_dim,
                self.rope_theta,
                dtype=torch.float32,
                device=self.q_proj.weight.device,
            )[1],
            persistent=False,
        )

    def _compute_rope_embeddings(self, max_position_embeddings, head_dim, base=10000, dtype=None, device=None):
        inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
        t = torch.arange(max_position_embeddings, device=device, dtype=torch.float32)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        cos = emb.cos().to(dtype)
        sin = emb.sin().to(dtype)
        return cos.unsqueeze(0), sin.unsqueeze(0)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        # In B S (H D)
        bsz, seq_len, _ = hidden_states.size()

        if position_ids is None:
            position_ids = torch.arange(seq_len, device=hidden_states.device)
            position_ids = repeat(position_ids, 'l -> b l', b=bsz)

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = rearrange(query_states, "b s (h d) -> b s h d", h=self.num_heads, d=self.head_dim)
        key_states = rearrange(key_states, "b s (h d) -> b s h d", h=self.num_key_value_heads, d=self.head_dim)
        value_states = rearrange(value_states, "b s (h d) -> b s h d", h=self.num_key_value_heads, d=self.head_dim)

        # Slice off position specific rope freqs from the cached freqs
        cos = self.cos_cached[:, position_ids]  # [1, bsz, seq_len, dim]
        sin = self.sin_cached[:, position_ids]  # [1, bsz, seq_len, dim]

        query_states, key_states = LigerRopeFunction.apply(
            query_states,
            key_states,
            cos.squeeze(0),
            sin.squeeze(0),
            position_ids
        )

        attn_output = flash_attn_func(
            query_states,
            key_states,
            value_states,
            dropout_p=0.0,
            causal=attention_mask is None
        )

        attn_output = rearrange(attn_output, "b s h d -> b s (h d)")
        return self.o_proj(attn_output)
llama_modeling/config.py ADDED
@@ -0,0 +1,15 @@
from dataclasses import dataclass

@dataclass
class LlamaConfig:
    hidden_size: int = 576
    num_attention_heads: int = 16
    num_key_value_heads: int = 4
    num_hidden_layers: int = 30
    intermediate_size: int = 1536
    hidden_act: str = "silu"
    rms_norm_eps: float = 1e-5
    vocab_size: int = 49152
    max_position_embeddings: int = 8192
    rope_theta: int = 100000
    tie_word_embeddings: bool = False
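Note: both test-train.py and inference.py build this dataclass from a HuggingFace-style config.json by filtering the JSON down to the declared fields, so unknown keys are dropped and missing keys fall back to the defaults above. A minimal sketch of that pattern (the example dict is illustrative, not the repo's actual config.json):

# Sketch only: how the scripts in this commit map a config.json dict onto LlamaConfig.
from llama_modeling.config import LlamaConfig

config_dict = {"hidden_size": 576, "num_attention_heads": 16, "architectures": ["LlamaForCausalLM"]}
known = {k: v for k, v in config_dict.items() if k in LlamaConfig.__dataclass_fields__}
config = LlamaConfig(**known)       # "architectures" is ignored, it is not a dataclass field
print(config.num_key_value_heads)   # 4, filled from the defaults above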
llama_modeling/decoder.py ADDED
@@ -0,0 +1,42 @@
from typing import Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F

from .mlp import LlamaMLP
from .config import LlamaConfig
from .rms_norm import LlamaRMSNorm
from .attention import LlamaAttention
from .diff_attn import DifferentialAttention
from .tensor_prod_attn import CausalTensorProductSelfAttn

class LlamaDecoderLayer(nn.Module):
    def __init__(self, config: LlamaConfig, layer_num):
        super().__init__()
        self.self_attn = CausalTensorProductSelfAttn(config)
        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:

        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states
llama_modeling/diff_attn.py ADDED
@@ -0,0 +1,150 @@
from flash_attn import flash_attn_func
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat

from .extact import xATGLU
from .liger_rope import LigerRopeFunction
from .config import LlamaConfig

# The four-flash attn strategy comes from here:
# https://github.com/microsoft/unilm/blob/master/Diff-Transformer/multihead_flashdiff_2.py

class DifferentialAttention(nn.Module):
    def __init__(self, config: LlamaConfig, layer_num):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.num_kv_heads = config.num_key_value_heads
        self.n_rep = self.num_heads // self.num_kv_heads
        self.head_dim = self.hidden_size // (2 * self.num_heads)
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.scaling = self.head_dim ** -0.5

        self.q_proj = nn.Linear(self.hidden_size, 2 * self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, 2 * self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, 2 * self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(2 * self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * layer_num)
        self.lambda_q1 = nn.Parameter(torch.zeros(self.head_dim).normal_(0, 0.1))
        self.lambda_k1 = nn.Parameter(torch.zeros(self.head_dim).normal_(0, 0.1))
        self.lambda_q2 = nn.Parameter(torch.zeros(self.head_dim).normal_(0, 0.1))
        self.lambda_k2 = nn.Parameter(torch.zeros(self.head_dim).normal_(0, 0.1))

        self.subln = nn.LayerNorm(2 * self.head_dim, elementwise_affine=False)

        self.register_buffer(
            "cos_cached",
            self._compute_rope_embeddings(
                self.max_position_embeddings,
                self.head_dim,
                self.rope_theta,
                dtype=torch.float32,
                device=self.q_proj.weight.device,
            )[0],
            persistent=False,
        )
        self.register_buffer(
            "sin_cached",
            self._compute_rope_embeddings(
                self.max_position_embeddings,
                self.head_dim,
                self.rope_theta,
                dtype=torch.float32,
                device=self.q_proj.weight.device,
            )[1],
            persistent=False,
        )

    def _compute_rope_embeddings(self, max_position_embeddings, head_dim, base=10000, dtype=None, device=None):
        inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
        t = torch.arange(max_position_embeddings, device=device, dtype=torch.float32)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        cos = emb.cos().to(dtype)
        sin = emb.sin().to(dtype)
        return cos.unsqueeze(0), sin.unsqueeze(0)

    def forward(
        self,
        hidden_states,
        attention_mask,
        position_ids,
    ) -> torch.Tensor:
        bsz, seq_len, embed_dim = hidden_states.size()

        if position_ids is None:
            position_ids = torch.arange(seq_len, device=hidden_states.device)
            position_ids = repeat(position_ids, 'l -> b l', b=bsz)

        q = self.q_proj(hidden_states)
        k = self.k_proj(hidden_states)
        v = self.v_proj(hidden_states)

        q = rearrange(q, 'b s (h d) -> b s h d', h=2*self.num_heads, d=self.head_dim)
        k = rearrange(k, 'b s (h d) -> b s h d', h=2*self.num_kv_heads, d=self.head_dim)

        # Reshaped for GQA
        v = rearrange(v, 'b s (h g d) -> b s h g d', h=self.num_kv_heads, g=2, d=self.head_dim)

        # Apply rotary embeddings using LigerRopeFunction
        cos = self.cos_cached[:, position_ids]  # [1, bsz, seq_len, dim]
        sin = self.sin_cached[:, position_ids]  # [1, bsz, seq_len, dim]
        q, k = LigerRopeFunction.apply(q, k, cos, sin, position_ids)

        # Rearrange into GQA style
        q = rearrange(q, 'b s (h g) d -> b s h g d', h=self.num_heads, g=2)
        k = rearrange(k, 'b s (h g) d -> b s h g d', h=self.num_kv_heads, g=2)

        q1, q2 = q[:, :, :, 0], q[:, :, :, 1]
        k1, k2 = k[:, :, :, 0], k[:, :, :, 1]
        v1, v2 = v[:, :, :, 0], v[:, :, :, 1]

        # First attention group on q1/k1 and the v's
        attn11 = flash_attn_func(
            q1,
            k1,
            v1,
            dropout_p=0.0,  # @Z TODO::
            causal=attention_mask is None
        )
        attn12 = flash_attn_func(
            q1,
            k1,
            v2,
            dropout_p=0.0,
            causal=attention_mask is None
        )
        attn1 = torch.cat([attn11, attn12], dim=-1)

        # Second attention group on q2/k2 and the v's
        attn21 = flash_attn_func(
            q2,
            k2,
            v1,
            dropout_p=0.0,
            causal=attention_mask is None
        )
        attn22 = flash_attn_func(
            q2,
            k2,
            v2,
            dropout_p=0.0,
            causal=attention_mask is None
        )
        attn2 = torch.cat([attn21, attn22], dim=-1)

        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1).float()).type_as(q)
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1).float()).type_as(q)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init
        attn = attn1 - lambda_full * attn2

        attn = self.subln(attn)
        attn = attn * (1 - self.lambda_init)

        attn_output = rearrange(attn, "b s h d -> b s (h d)")
        return self.o_proj(attn_output)
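Note: the mixing scalar follows the Differential Transformer reparameterization, lambda = exp(lambda_q1·lambda_k1) − exp(lambda_q2·lambda_k2) + lambda_init, with lambda_init depending on the layer index. A small self-contained sketch of that computation (the head_dim value here is illustrative):

# Sketch only: the lambda used above to combine the two flash-attention groups.
import math
import torch

layer_num = 0
head_dim = 32                                          # illustrative
lambda_init = 0.8 - 0.6 * math.exp(-0.3 * layer_num)   # 0.2 at the first layer
lambda_q1 = torch.zeros(head_dim).normal_(0, 0.1)
lambda_k1 = torch.zeros(head_dim).normal_(0, 0.1)
lambda_q2 = torch.zeros(head_dim).normal_(0, 0.1)
lambda_k2 = torch.zeros(head_dim).normal_(0, 0.1)

lambda_full = (torch.exp((lambda_q1 * lambda_k1).sum())
               - torch.exp((lambda_q2 * lambda_k2).sum())
               + lambda_init)
# forward() then takes attn1 - lambda_full * attn2, LayerNorms it, and rescales by (1 - lambda_init).
print(float(lambda_full))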
llama_modeling/extact.py ADDED
@@ -0,0 +1,25 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

# Very similar to GeGLU or SwiGLU, there's a learned gate FN, uses arctan as the activation fn.
class xATGLU(nn.Module):
    def __init__(self, input_dim, output_dim, bias=True):
        super().__init__()
        # GATE path | VALUE path
        self.proj = nn.Linear(input_dim, output_dim * 2, bias=bias)
        nn.init.kaiming_normal_(self.proj.weight, nonlinearity='linear')

        self.alpha = nn.Parameter(torch.zeros(1))
        self.half_pi = torch.pi / 2
        self.inv_pi = 1 / torch.pi

    def forward(self, x):
        projected = self.proj(x)
        gate_path, value_path = projected.chunk(2, dim=-1)

        # Apply arctan gating with expanded range via learned alpha -- https://arxiv.org/pdf/2405.20768
        gate = (torch.arctan(gate_path) + self.half_pi) * self.inv_pi
        expanded_gate = gate * (1 + 2 * self.alpha) - self.alpha

        return expanded_gate * value_path  # g(x) × y
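Note: since arctan maps to (−π/2, π/2), the gate above lands in (0, 1), and the learned alpha widens that to (−α, 1 + α); alpha starts at zero, so at initialization it behaves like a plain arctan gate. A quick shape and range check (dimensions are illustrative):

# Sketch only: exercising xATGLU and the gate range it produces.
import torch
from llama_modeling.extact import xATGLU

glu = xATGLU(input_dim=576, output_dim=1536)
x = torch.randn(2, 8, 576)          # (batch, seq, features)
print(glu(x).shape)                 # torch.Size([2, 8, 1536])

gate = (torch.arctan(torch.tensor(3.0)) + torch.pi / 2) / torch.pi
print(float(gate))                  # ~0.90, always inside (0, 1) before the alpha expansion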
llama_modeling/front_end.py ADDED
@@ -0,0 +1,85 @@
from typing import Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat

from .config import LlamaConfig
from .model import LlamaModel

class LlamaForCausalLM(nn.Module):
    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.model = LlamaModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Weight tying uses the head weights as the classifier for the token embeddings for both in and out.
        if config.tie_word_embeddings:
            self.lm_head.weight = self.model.embed_tokens.weight

        self._init_weights()

    def _init_weights(self):
        """Initialize weights for all layers."""
        # Initialize embeddings
        if hasattr(self.model, 'embed_tokens'):
            nn.init.normal_(self.model.embed_tokens.weight, mean=0.0, std=0.041666666666666664)

        # Initialize linear layers
        for module in self.modules():
            if isinstance(module, nn.Linear):
                # Xavier/Glorot initialization for weights
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    # Zero initialization for biases
                    nn.init.zeros_(module.bias)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )

        return hidden_states, self.lm_head.weight

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.LongTensor,
        max_new_tokens: int = 30,
        temperature: float = 0.0,
    ) -> torch.LongTensor:
        self.eval()
        bsz, seq_len = input_ids.shape

        position_ids = repeat(
            torch.arange(seq_len, device=input_ids.device),
            'l -> b l',
            b=bsz
        )

        for _ in range(max_new_tokens):
            hidden_states, classifier_weights = self.forward(input_ids, position_ids=position_ids)

            # Get logits by computing hidden_states @ classifier_weights.T
            next_token_logits = hidden_states[:, -1] @ classifier_weights.T

            if temperature == 0:
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
            else:
                scaled_logits = next_token_logits / temperature
                probs = torch.softmax(scaled_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

            input_ids = torch.cat([input_ids, next_token], dim=1)
            new_position_ids = position_ids[:, -1:] + 1
            position_ids = torch.cat([position_ids, new_position_ids], dim=1)

        return input_ids
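Note: forward deliberately returns (hidden_states, lm_head.weight) instead of logits. generate() materializes logits only for the last position with a matmul, while test-train.py hands both tensors to cut_cross_entropy's linear_cross_entropy so the full (batch, seq, vocab) logit tensor is never allocated. A minimal sketch of the logits step with stand-in tensors (sizes follow the LlamaConfig defaults):

# Sketch only: turning (hidden_states, classifier_weights) into next-token logits.
import torch

hidden_size, vocab_size = 576, 49152
hidden_states = torch.randn(1, 16, hidden_size)             # what LlamaModel produces
classifier_weights = torch.randn(vocab_size, hidden_size)   # lm_head.weight

next_token_logits = hidden_states[:, -1] @ classifier_weights.T
print(next_token_logits.shape)                              # torch.Size([1, 49152])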
llama_modeling/liger_rope.py ADDED
@@ -0,0 +1,258 @@
import torch
import triton
import triton.language as tl

# https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/rope.py
# BSD 2-CLAUSE LICENSE
# Copyright 2024 LinkedIn Corporation
# All Rights Reserved.
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@triton.jit
def _triton_rope(
    q_ptr,
    q_row_stride,
    k_ptr,
    k_row_stride,
    cos,
    cos_row_stride,
    sin,
    sin_row_stride,
    sl,
    bs: tl.constexpr,
    cos_bs: tl.constexpr,
    n_qh: tl.constexpr,
    n_kh: tl.constexpr,
    hd: tl.constexpr,
    pad_n_qh: tl.constexpr,
    pad_n_kh: tl.constexpr,
    pad_hd: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
    BACKWARD_PASS: tl.constexpr = False,
):
    # q size: (bsz, seq_len, num_q_heads, head_dim)
    # q stride: (seq_len * num_q_heads * head_dim, num_q_heads * head_dim, head_dim, 1)
    # k size: (bsz, seq_len, num_kv_heads, head_dim)
    # k stride: (seq_len * num_kv_heads * head_dim, num_kv_heads * head_dim, head_dim, 1)

    # cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
    # stride: (seq_len * head_dim, head_dim, 1)
    pid = tl.program_id(0)

    # locate start address
    q_ptr = q_ptr + pid * q_row_stride
    k_ptr = k_ptr + pid * k_row_stride

    # ####################################################################
    # get the cos(mθ_{i...d/2}) and sin(mθ_{i...d/2}) for token position
    # m of this program instance
    # ####################################################################

    # 1. program instances are laid out in a 1D vector of size bsz * seq_len, which
    # effectively represents a 2D grid of size [bsz, seq_len] with seq_len dimension
    # being the fastest changing dimension. Thus we can simply do pid // sl to get the batch index
    # and pid % sl to get the sequence index.
    # 2. We only need the left half of cos and sin matrix because the right half is just
    # a clone of the left half.
    batch_idx = pid // sl
    cos_row_idx = pid % sl
    cos = cos + tl.where(
        cos_bs == 1,
        cos_row_idx * cos_row_stride,
        batch_idx * (sl * cos_row_stride) + cos_row_idx * cos_row_stride,
    )
    sin = sin + tl.where(
        cos_bs == 1,
        cos_row_idx * sin_row_stride,
        batch_idx * (sl * sin_row_stride) + cos_row_idx * sin_row_stride,
    )

    cos_offsets = tl.arange(0, pad_hd // 2)
    cos_mask = cos_offsets < hd // 2
    cos_row = tl.load(cos + cos_offsets, mask=cos_mask, other=0)
    sin_row = tl.load(sin + cos_offsets, mask=cos_mask, other=0)

    # ####################################################################
    # Load the left and right half of q and k for the current
    # program instance (i.e. for the current token) separately
    # ####################################################################
    # left half of the head
    first_half_q_offsets = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
    first_half_k_offsets = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
    first_q_mask = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
    first_k_mask = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (tl.arange(0, pad_hd // 2)[None, :] < hd // 2)
    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(sin_row.dtype)
    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(sin_row.dtype)

    # right half of the head
    second_half_q_offsets = first_half_q_offsets + (hd // 2)
    second_half_k_offsets = first_half_k_offsets + (hd // 2)
    second_q_mask = first_q_mask
    second_k_mask = first_k_mask
    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(sin_row.dtype)
    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(sin_row.dtype)

    if not BACKWARD_PASS:
        # y = [x1, x2] * [cos, cos] + [-x2, x1] * [sin, sin]
        new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row
        tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
        new_q_tile_2 = q_tile_2 * cos_row + q_tile_1 * sin_row
        tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)

        new_k_tile_1 = k_tile_1 * cos_row - k_tile_2 * sin_row
        tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
        new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
        tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
    else:
        # with some math, we can get:
        # dy = [dx1, dx2] * [cos, cos] + [-dx2, dx1] * [-sin, -sin]
        new_q_tile_1 = q_tile_1 * cos_row + q_tile_2 * sin_row
        tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
        new_q_tile_2 = q_tile_2 * cos_row - q_tile_1 * sin_row
        tl.store(q_ptr + second_half_q_offsets, new_q_tile_2, mask=second_q_mask)

        new_k_tile_1 = k_tile_1 * cos_row + k_tile_2 * sin_row
        tl.store(k_ptr + first_half_k_offsets, new_k_tile_1, mask=first_k_mask)
        new_k_tile_2 = k_tile_2 * cos_row - k_tile_1 * sin_row
        tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)


def rope_forward(q, k, cos, sin):
    # transpose it back to the physical shape because Triton looks at the physical storage
    # note: q and k are incontiguous before the transformation and will become contiguous after transpose
    batch_size, seq_len, n_q_head, head_dim = q.shape
    n_kv_head = k.shape[2]

    pad_hd = triton.next_power_of_2(head_dim)
    pad_n_q_head = triton.next_power_of_2(n_q_head)
    pad_n_kv_head = triton.next_power_of_2(n_kv_head)
    BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head)

    n_row = batch_size * seq_len

    # ensure tensors passed into the kernel are contiguous. It will be no-op if they are already contiguous
    q = q.contiguous()
    k = k.contiguous()
    cos = cos.contiguous()
    sin = sin.contiguous()
    cos_batch_size = cos.shape[0]

    _triton_rope[(n_row,)](
        q,
        q.stride(1),
        k,
        k.stride(1),
        cos,
        cos.stride(-2),
        sin,
        sin.stride(-2),
        seq_len,
        batch_size,
        cos_batch_size,
        n_q_head,
        n_kv_head,
        head_dim,
        pad_n_q_head,
        pad_n_kv_head,
        pad_hd,
        BLOCK_SIZE=BLOCK_SIZE,
        BACKWARD_PASS=False,
    )
    return q, k, cos, sin


def rope_backward(dq, dk, cos, sin):
    batch_size, seq_len, n_q_head, head_dim = dq.shape
    cos_batch_size = cos.shape[0]
    n_kv_head = dk.shape[2]
    pad_hd = triton.next_power_of_2(head_dim)
    pad_n_q_head = triton.next_power_of_2(n_q_head)
    pad_n_kv_head = triton.next_power_of_2(n_kv_head)
    BLOCK_SIZE = max(pad_n_q_head, pad_n_kv_head)

    n_row = batch_size * seq_len

    # ensure dq and dk are contiguous
    dq = dq.contiguous()
    dk = dk.contiguous()

    # backward is similar to forward except swapping few ops
    _triton_rope[(n_row,)](
        dq,
        dq.stride(1),
        dk,
        dk.stride(1),
        cos,
        cos.stride(-2),
        sin,
        sin.stride(-2),
        seq_len,
        batch_size,
        cos_batch_size,
        n_q_head,
        n_kv_head,
        head_dim,
        pad_n_q_head,
        pad_n_kv_head,
        pad_hd,
        BLOCK_SIZE=BLOCK_SIZE,
        BACKWARD_PASS=True,
    )
    return dq, dk


class LigerRopeFunction(torch.autograd.Function):
    """
    Triton implementation of the Rotary Positional Embedding (RoPE) operation. Please note that
    this implements the HuggingFace Llama & Mistral version, whose rotation matrix is slightly different
    than the original RoPE paper.

    Please find the corresponding HuggingFace implementation here:
    https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/llama/modeling_llama.py#L184

    For more details about the rotation matrix used here, please refer to:
    https://discuss.huggingface.co/t/is-llama-rotary-embedding-implementation-correct/44509/2
    """

    @staticmethod
    def forward(ctx, q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
        """
        q size: (bsz, n_q_head, seq_len, head_dim)
        k size: (bsz, n_kv_head, seq_len, head_dim)
        cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
        sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
        """
        q, k, cos, sin = rope_forward(q, k, cos, sin)
        ctx.save_for_backward(cos, sin)
        return q, k

    # Note: declared as a staticmethod to match forward() and the upstream Liger kernel.
    @staticmethod
    def backward(ctx, dq, dk):
        """
        dq size: (bsz, n_q_head, seq_len, head_dim)
        dk size: (bsz, n_kv_head, seq_len, head_dim)
        cos size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
        sin size: (1, seq_len, head_dim) or (bsz, seq_len, head_dim)
        """

        cos, sin = ctx.saved_tensors
        dq, dk = rope_backward(dq, dk, cos, sin)
        return dq, dk, None, None, None, None
llama_modeling/mlp.py ADDED
@@ -0,0 +1,18 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from .config import LlamaConfig

class LlamaMLP(nn.Module):
    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, x):
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
llama_modeling/model.py ADDED
@@ -0,0 +1,35 @@
from typing import Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F

from .mlp import LlamaMLP
from .config import LlamaConfig
from .rms_norm import LlamaRMSNorm
from .decoder import LlamaDecoderLayer

class LlamaModel(nn.Module):
    def __init__(self, config: LlamaConfig):
        super().__init__()
        self.config = config
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=None)
        self.layers = nn.ModuleList([LlamaDecoderLayer(config, i) for i in range(config.num_hidden_layers)])
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)

        for decoder_layer in self.layers:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
            )

        hidden_states = self.norm(hidden_states)
        return hidden_states
llama_modeling/rms_norm.py ADDED
@@ -0,0 +1,16 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
llama_modeling/rope.py ADDED
@@ -0,0 +1,34 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

def rotate_half(x):
    x1, x2 = torch.chunk(x, 2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin):
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

class LlamaRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=8192, base=10000):
        super().__init__()
        self.dim = dim
        self.base = base
        self.max_position_embeddings = max_position_embeddings
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, position_ids: torch.LongTensor):
        # position_ids: [batch_size, seq_len]
        inv_freq = self.inv_freq.to(device=position_ids.device)
        inv_freq_expanded = inv_freq[None, None, :]  # [1, 1, dim//2]
        position_ids_expanded = position_ids[:, :, None].float()  # [batch_size, seq_len, 1]
        freqs = torch.matmul(position_ids_expanded, inv_freq_expanded)  # [batch_size, seq_len, dim//2]
        freqs = torch.cat([freqs, freqs], dim=-1)  # [batch_size, seq_len, dim]
        cos = torch.cos(freqs)
        sin = torch.sin(freqs)
        cos = cos.unsqueeze(1)  # [batch_size, 1, seq_len, dim]
        sin = sin.unsqueeze(1)  # [batch_size, 1, seq_len, dim]
        return cos, sin
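Note: this file is a plain eager RoPE helper; the attention modules in this commit call the Triton LigerRopeFunction instead, so apply_rotary_pos_emb appears to be kept as a reference path. A usage sketch with illustrative shapes, showing how the cos/sin broadcast over the head axis:

# Sketch only: eager RoPE from this file (shapes are illustrative).
import torch
from llama_modeling.rope import LlamaRotaryEmbedding, apply_rotary_pos_emb

bsz, heads, seq, dim = 2, 4, 16, 64
rope = LlamaRotaryEmbedding(dim)
position_ids = torch.arange(seq).unsqueeze(0).expand(bsz, -1)   # [bsz, seq]
cos, sin = rope(position_ids)                                   # [bsz, 1, seq, dim]

q = torch.randn(bsz, heads, seq, dim)
k = torch.randn(bsz, heads, seq, dim)
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)
print(q_rot.shape, k_rot.shape)                                 # both [bsz, heads, seq, dim]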
llama_modeling/tensor_prod_attn.py ADDED
@@ -0,0 +1,147 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple
from dataclasses import dataclass
from einops import rearrange, repeat

from flash_attn import flash_attn_func
from .liger_rope import LigerRopeFunction
from .rms_norm import LlamaRMSNorm
from .config import LlamaConfig

class CPLinear(nn.Module):
    def __init__(self, in_features, n_head, head_dim, kv_rank=2, q_rank=6):
        super().__init__()
        self.W_A_q = nn.Linear(in_features, n_head * q_rank, bias=False)
        self.W_B_q = nn.Linear(in_features, q_rank * head_dim, bias=False)
        self.W_A_k = nn.Linear(in_features, n_head * kv_rank, bias=False)
        self.W_B_k = nn.Linear(in_features, kv_rank * head_dim, bias=False)
        self.W_A_v = nn.Linear(in_features, n_head * kv_rank, bias=False)
        self.W_B_v = nn.Linear(in_features, kv_rank * head_dim, bias=False)

        nn.init.xavier_uniform_(self.W_A_q.weight)
        nn.init.xavier_uniform_(self.W_B_q.weight)
        nn.init.xavier_uniform_(self.W_A_k.weight)
        nn.init.xavier_uniform_(self.W_B_k.weight)
        nn.init.xavier_uniform_(self.W_A_v.weight)
        nn.init.xavier_uniform_(self.W_B_v.weight)

        self.n_head = n_head
        self.q_rank = q_rank
        self.head_dim = head_dim
        self.kv_rank = kv_rank

    def forward(self, x):
        batch_size, seq_len, _ = x.size()

        A_q = self.W_A_q(x).view(batch_size, seq_len, self.n_head, self.q_rank)
        A_k = self.W_A_k(x).view(batch_size, seq_len, self.n_head, self.kv_rank)
        A_v = self.W_A_v(x).view(batch_size, seq_len, self.n_head, self.kv_rank)

        B_q = self.W_B_q(x).view(batch_size, seq_len, self.q_rank, self.head_dim)
        B_k = self.W_B_k(x).view(batch_size, seq_len, self.kv_rank, self.head_dim)
        B_v = self.W_B_v(x).view(batch_size, seq_len, self.kv_rank, self.head_dim)

        A_q = A_q.view(batch_size * seq_len, self.n_head, self.q_rank)
        A_k = A_k.view(batch_size * seq_len, self.n_head, self.kv_rank)
        A_v = A_v.view(batch_size * seq_len, self.n_head, self.kv_rank)

        B_q = B_q.view(batch_size * seq_len, self.q_rank, self.head_dim)
        B_k = B_k.view(batch_size * seq_len, self.kv_rank, self.head_dim)
        B_v = B_v.view(batch_size * seq_len, self.kv_rank, self.head_dim)

        q = torch.bmm(A_q, B_q).div_(self.q_rank).view(batch_size, seq_len, self.n_head, self.head_dim)
        k = torch.bmm(A_k, B_k).div_(self.kv_rank).view(batch_size, seq_len, self.n_head, self.head_dim)
        v = torch.bmm(A_v, B_v).div_(self.kv_rank).view(batch_size, seq_len, self.n_head, self.head_dim)

        return q, k, v

class CausalTensorProductSelfAttn(nn.Module):
    def __init__(self, config, kv_rank=2, q_rank=6):
        super().__init__()
        self.n_head = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.n_embd = config.hidden_size
        self.rank = kv_rank
        self.q_rank = q_rank
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta

        self.c_qkv = CPLinear(self.n_embd, self.n_head, self.head_dim, self.rank, self.q_rank)
        self.o_proj = nn.Linear(self.n_head * self.head_dim, self.n_embd, bias=False)

        self.register_buffer(
            "cos_cached",
            self._compute_rope_embeddings(
                self.max_position_embeddings,
                self.head_dim,
                self.rope_theta,
                dtype=torch.float32,
                device=self.o_proj.weight.device,
            )[0],
            persistent=False,
        )
        self.register_buffer(
            "sin_cached",
            self._compute_rope_embeddings(
                self.max_position_embeddings,
                self.head_dim,
                self.rope_theta,
                dtype=torch.float32,
                device=self.o_proj.weight.device,
            )[1],
            persistent=False,
        )

        self.using_groupnorm = getattr(config, 'using_groupnorm', False)
        self.subln = LlamaRMSNorm(self.head_dim, eps=1e-5)

    def _compute_rope_embeddings(self, max_position_embeddings, head_dim, base=10000, dtype=None, device=None):
        inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=device).float() / head_dim))
        t = torch.arange(max_position_embeddings, device=device, dtype=torch.float32)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        cos = emb.cos().to(dtype)
        sin = emb.sin().to(dtype)
        return cos.unsqueeze(0), sin.unsqueeze(0)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ) -> torch.Tensor:
        # In B S (H D)
        bsz, seq_len, _ = hidden_states.size()

        if position_ids is None:
            position_ids = torch.arange(seq_len, device=hidden_states.device)
            position_ids = repeat(position_ids, 'l -> b l', b=bsz)

        q, k, v = self.c_qkv(hidden_states)  # B S (HD) -> B S H D

        cos = self.cos_cached[:, position_ids]  # [1, bsz, seq_len, dim]
        sin = self.sin_cached[:, position_ids]  # [1, bsz, seq_len, dim]

        q, k = LigerRopeFunction.apply(
            q,
            k,
            cos.squeeze(0),
            sin.squeeze(0),
            position_ids
        )

        attn_out = flash_attn_func(
            q,
            k,
            v,
            dropout_p=0.0,
            causal=attention_mask is None
        )

        attn_out = self.subln(attn_out)

        attn_out = rearrange(attn_out, "b s h d -> b s (h d)")
        attn_out = self.o_proj(attn_out)
        return attn_out
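Note: CPLinear builds each projection as a per-token product of a small head-mixing factor A and a feature-basis factor B (rank 6 for Q, rank 2 for K/V by default), which is considerably smaller than a dense fused QKV projection at this hidden size. A quick check of the shapes and parameter counts (batch and sequence sizes are illustrative; hidden/head sizes follow the LlamaConfig defaults above):

# Sketch only: shape and parameter-count check for the factored QKV projection.
import torch
from llama_modeling.tensor_prod_attn import CPLinear

hidden, n_head, head_dim = 576, 16, 36
cp = CPLinear(hidden, n_head, head_dim, kv_rank=2, q_rank=6)

x = torch.randn(2, 8, hidden)
q, k, v = cp(x)
print(q.shape, k.shape, v.shape)                 # each torch.Size([2, 8, 16, 36])

cp_params = sum(p.numel() for p in cp.parameters())
dense_params = 3 * hidden * (n_head * head_dim)  # a standard dense QKV projection
print(cp_params, dense_params)                   # 299520 vs 995328 weights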
test-train.py ADDED
@@ -0,0 +1,238 @@
import torch
import torch.nn as nn
from torch.cuda.amp import autocast
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import math, os, sys, json, glob, time, random
from torch.optim.lr_scheduler import CosineAnnealingLR
from transformers import AutoTokenizer
from distributed_shampoo import AdamGraftingConfig, DistributedShampoo
from cut_cross_entropy import linear_cross_entropy
from torch.nn.utils import clip_grad_norm_
from utils.trainutils import count_parameters_layerwise, save_checkpoint, TBLogger

from llama_modeling.front_end import LlamaForCausalLM
from llama_modeling.config import LlamaConfig

class JSONLDataset(Dataset):
    def __init__(self, directory_path, tokenizer, seq_length=1024,
                 text_key="text", max_files=None, batch_size=1000,
                 pad_token_id=0):
        self.seq_length = seq_length
        self.tokenizer = tokenizer
        self.pad_token_id = pad_token_id
        self.sequences = []

        files = glob.glob(os.path.join(directory_path, "*.jsonl"))
        if max_files is not None:
            files = files[:max_files]

        text_batch = []
        for file_idx, file_path in enumerate(files):
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    try:
                        data = json.loads(line)
                        text = data.get(text_key, "")
                        if len(text) >= 100:
                            text_batch.append(text)

                            if len(text_batch) >= batch_size:
                                self._process_batch(text_batch)
                                text_batch = []
                    except:
                        continue

        if text_batch:
            self._process_batch(text_batch)

        if self.sequences:
            self.sequences = torch.tensor(self.sequences, dtype=torch.long)
        else:
            self.sequences = torch.empty((0, seq_length), dtype=torch.long)

    def _process_batch(self, texts):
        encoded = self.tokenizer(
            texts,
            add_special_tokens=False,
            truncation=True,
            padding=False,
            return_attention_mask=False,
            return_tensors=None
        )['input_ids']

        mlen = 0
        for token_ids in encoded:
            for i in range(0, len(token_ids), self.seq_length):
                chunk = token_ids[i:i+self.seq_length]

                # Pad
                if len(chunk) < self.seq_length:
                    chunk += [self.pad_token_id] * (self.seq_length - len(chunk))

                self.sequences.append(chunk)
                mlen = max(mlen, len(chunk))

        print("MAX: ", mlen)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx]

def train_model(model, train_loader, optimizer, device, epochs=5, forward_dtype=torch.float32):
    model.train()
    criterion = nn.CrossEntropyLoss()
    scaler = torch.amp.GradScaler("cuda")

    logger = TBLogger(log_dir=f'logs/run-{time.time()}')

    total_steps = len(train_loader) * epochs
    scheduler = CosineAnnealingLR(
        optimizer,
        T_max=total_steps,
        eta_min=5e-6
    )

    model = torch.compile(
        model,
    )

    global_step = 0
    for epoch in range(epochs):
        running_loss = 0.0
        total_batches = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}')

        for batch_idx, data in enumerate(progress_bar):
            data = data.to(device)
            optimizer.zero_grad(set_to_none=True)

            with torch.autocast(device_type='cuda', dtype=forward_dtype):
                hidden_states, classifier_weights = model(data)

                loss = linear_cross_entropy(
                    hidden_states,
                    classifier_weights,
                    data,
                    shift=True,
                    reduction="mean"
                )

            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            # Update metrics - just add the loss itself
            running_loss += loss.item()
            total_batches += 1
            global_step += 1
            avg_loss = running_loss / total_batches
            perplexity = math.exp(min(avg_loss, 100))

            progress_bar.set_postfix({
                'loss': f'{avg_loss:.4f}',
                'ppl': f'{perplexity:.2f}'
            })

            metrics = {
                'loss': loss.item(),
                'perplexity': perplexity,
                'learning_rate': optimizer.param_groups[0]['lr'],
                'batch_size': data.size(0)
            }

            logger.log(metrics, step=global_step, model=model, grad_checking=True)

            if batch_idx % 100 == 0:
                print(f'\nBatch {batch_idx}/{len(train_loader)}: '
                      f'Loss: {avg_loss:.4f}, '
                      f'Perplexity: {perplexity:.2f}, '
                      f'Batches Processed: {total_batches}')

        epoch_loss = running_loss / total_batches
        epoch_ppl = math.exp(min(epoch_loss, 100))
        print(f'\nEpoch {epoch+1} Summary:')
        print(f'Average Loss: {epoch_loss:.4f}')
        print(f'Perplexity: {epoch_ppl:.2f}')
        print(f'Total Batches Processed: {total_batches}\n')

        save_checkpoint(model, f'epoch_{epoch+1}.safetensors')

def sample_examples(dataset, tokenizer, num_samples=5):
    if len(dataset) == 0:
        print("The dataset is empty.")
        return

    num_samples = min(num_samples, len(dataset))

    sampled_indices = random.sample(range(len(dataset)), num_samples)

    for i, idx in enumerate(sampled_indices):
        sequence = dataset[idx]
        print(f"Sample {i + 1} (Index {idx}):")
        print(sequence)
        decoded_text = tokenizer.decode(sequence, skip_special_tokens=False, decode_special_tokens=False)
        print(decoded_text)
        print("-" * 40)

def main():
    BATCH_SIZE = 36
    SEQ_LENGTH = 512
    EPOCHS = 3
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained("./SmolLM2-135M-Instruct")

    config_path = "config.json"
    with open(config_path) as f:
        config_dict = json.load(f)
    config = LlamaConfig(**{k: v for k, v in config_dict.items() if k in LlamaConfig.__dataclass_fields__})

    model = LlamaForCausalLM(config).to("cuda")

    dataset = JSONLDataset(
        directory_path="./Data_big",
        tokenizer=tokenizer,
        seq_length=SEQ_LENGTH,
        text_key="text",
        max_files=None,
    )

    train_loader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
        drop_last=True
    )

    optimizer = DistributedShampoo(
        model.parameters(),
        lr=0.0001,
        betas=(0.9, 0.999),
        epsilon=1e-12,
        weight_decay=1e-05,
        max_preconditioner_dim=2048,
        precondition_frequency=100,
        start_preconditioning_step=250,
        use_decoupled_weight_decay=False,
        grafting_config=AdamGraftingConfig(
            beta2=0.999,
            epsilon=1e-12,
        ),
    )

    print("*"*100)
    torch.set_float32_matmul_precision('high')

    count_parameters_layerwise(model)

    train_model(model, train_loader, optimizer, DEVICE, EPOCHS, forward_dtype=torch.bfloat16)

if __name__ == "__main__":
    main()
utils/__init__.py ADDED
File without changes
utils/trainutils.py ADDED
@@ -0,0 +1,92 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from tensorboardX import SummaryWriter
from safetensors.torch import save_file, load_file
from pathlib import Path
import time

def count_parameters_layerwise(model):
    # Layerwise params, turn this into a util function.
    total_params = 0
    layer_params = {}

    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue

        param_count = parameter.numel()
        layer_params[name] = param_count
        total_params += param_count

    print(f"\nModel Parameter Summary:")
    print("-" * 60)
    for name, count in layer_params.items():
        print(f"{name}: {count:,} parameters")
    print("-" * 60)
    print(f"Total Trainable Parameters: {total_params:,}\n")

    return total_params

def save_checkpoint(model, filename="checkpoint.safetensors"):
    if hasattr(model, '_orig_mod'):
        model = model._orig_mod

    torch.save(model.state_dict(), filename.replace('.safetensors', '.pt'))

def load_checkpoint(model, filename="checkpoint.safetensors"):
    if hasattr(model, '_orig_mod'):
        model = model._orig_mod

    try:
        model_state = load_file(filename)
        model.load_state_dict(model_state)
    except Exception as e:
        model_state = torch.load(filename.replace('.safetensors', '.pt'), weights_only=True)
        model.load_state_dict(model_state)

class TBLogger:
    def __init__(self, log_dir='logs/current_run', flush_secs=10, enable_grad_logging=True):
        Path(log_dir).mkdir(parents=True, exist_ok=True)
        self.writer = SummaryWriter(log_dir, flush_secs=flush_secs)
        self.enable_grad_logging = enable_grad_logging
        self.start_time = time.time()

    def log(self, metrics, step=None, model=None, prefix='', grad_checking=False):
        for name, value in metrics.items():
            full_name = f"{prefix}{name}" if prefix else name

            if isinstance(value, (int, float)):
                self.writer.add_scalar(full_name, value, step)
            elif isinstance(value, torch.Tensor):
                self.writer.add_scalar(full_name, value.item(), step)
            elif isinstance(value, (list, tuple)) and len(value) > 0:
                if all(isinstance(x, (int, float)) for x in value):
                    self.writer.add_histogram(full_name, torch.tensor(value), step)

        if self.enable_grad_logging and model is not None:
            self._log_gradients(model, step, grad_checking)

    def _log_gradients(self, model, step, grad_checking):
        total_norm = 0.0
        for name, param in model.named_parameters():
            if grad_checking and param.grad is not None:
                # Check for inf/nan in gradients
                if torch.isnan(param.grad).any():
                    print(f"Warning: Found nan in gradients for layer: {name}")
                    continue
                if torch.isinf(param.grad).any():
                    print(f"Warning: Found inf in gradients for layer: {name}")
                    continue

                param_norm = param.grad.detach().data.norm(2)
                self.writer.add_scalar(f"gradients/{name}_norm", param_norm, step)
                total_norm += param_norm.item() ** 2

        # Only compute total norm if we haven't encountered inf/nan
        if total_norm > 0:  # This means we had valid gradients
            total_norm = total_norm ** 0.5
            self.writer.add_scalar("gradients/total_norm", total_norm, step)

    def close(self):
        self.writer.close()