Crystalcareai committed
Commit 6adf5e4 · verified · 1 Parent(s): 2e51e15

Update modeling_gemmoe.py

Files changed (1):
  1. modeling_gemmoe.py +45 -25
modeling_gemmoe.py CHANGED
@@ -55,6 +55,7 @@ if is_flash_attn_2_available():
 
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
+
 if is_torch_fx_available():
     if not is_torch_greater_or_equal_than_1_13:
         import torch.fx
@@ -166,42 +167,52 @@ class GemmoeRMSNorm(nn.Module):
         self.weight = nn.Parameter(torch.zeros(dim))
 
     def _norm(self, x):
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+        # Ensure the entire normalization is done in float32
+        x_float = x.float()  # upcast to float32
+        mean = x_float.pow(2).mean(-1, keepdim=True)
+        normed_x = x_float * torch.rsqrt(mean + self.eps)
+        return normed_x
 
     def forward(self, x):
-        output = self._norm(x.float()).type_as(x)
-        return output * (self.weight + 1)
+        normed_x = self._norm(x)
+        # Downcast the result to the original dtype at the end
+        normed_x = normed_x.type_as(x)
+        return normed_x * (self.weight + 1)
 
 ALL_LAYERNORM_LAYERS.append(GemmoeRMSNorm)
 
 class GemmoeRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
-
         self.dim = dim
         self.max_position_embeddings = max_position_embeddings
         self.base = base
-        self.register_buffer("inv_freq", None, persistent=False)
-
-    @torch.no_grad()
-    def forward(self, x, position_ids, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        if self.inv_freq is None:
-            self.inv_freq = 1.0 / (
-                self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim)
-            )
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
-        position_ids_expanded = position_ids[:, None, :].float()
-        # Force float32 since bfloat16 loses precision on long contexts
-        # See https://github.com/huggingface/transformers/pull/29285
-        device_type = x.device.type
-        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-            emb = torch.cat((freqs, freqs), dim=-1)
-            cos = emb.cos()
-            sin = emb.sin()
-        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+        self._set_cos_sin_cache(seq_len=max_position_embeddings, device=device, dtype=torch.get_default_dtype())
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        freq_exponents = (2.0 / self.dim) * (
+            torch.arange(self.dim // 2, dtype=torch.int64, device="cpu").float()
+        )
+        timescale = self.base ** freq_exponents
+        positions = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float()
+        radians_new = positions[..., None] / timescale[None, None, :]
+        radians_new = radians_new.squeeze(0)
+        emb = torch.cat((radians_new, radians_new), dim=-1)
+        cos = emb.cos().to(device=device, non_blocking=True)
+        sin = emb.sin().to(device=device, non_blocking=True)
+        self.register_buffer("cos_cached", cos, persistent=False)
+        self.register_buffer("sin_cached", sin, persistent=False)
+
+    def forward(self, x, position_ids=None, seq_len=None):
+        if seq_len is None:
+            seq_len = x.size(2)
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+        return (
+            self.cos_cached[:seq_len],
+            self.sin_cached[:seq_len],
+        )
 
 # Copied from transformers.models.llama.modeling_llama.rotate_half
 def rotate_half(x):
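The hunk above moves the whole RMS normalization into float32 and only downcasts at the end. As a minimal standalone sketch of that pattern (the function name rms_norm_fp32 and the toy shapes are illustrative, not part of modeling_gemmoe.py):

import torch

def rms_norm_fp32(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Upcast to float32, normalize, downcast, then apply the (1 + weight) scale,
    # mirroring the updated GemmoeRMSNorm._norm / forward split.
    x_float = x.float()
    normed = x_float * torch.rsqrt(x_float.pow(2).mean(-1, keepdim=True) + eps)
    return normed.type_as(x) * (weight + 1)

x = torch.randn(2, 4, 8, dtype=torch.bfloat16)
weight = torch.zeros(8, dtype=torch.bfloat16)  # zero-initialized, as in the module
print(rms_norm_fp32(x, weight).dtype)  # torch.bfloat16; the normalization itself ran in float32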
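The rotary rework in the same hunk replaces the on-the-fly inv_freq computation with a precomputed cos/sin cache that is sliced per call. A quick standalone check of the cache shapes (dim, base and seq_len here are arbitrary toy values, not read from the Gemmoe config):

import torch

dim, base, max_seq_len = 8, 10000, 16
# Same arithmetic as _set_cos_sin_cache, written without the extra squeeze
freq_exponents = (2.0 / dim) * torch.arange(dim // 2, dtype=torch.int64).float()
timescale = base ** freq_exponents
positions = torch.arange(max_seq_len, dtype=torch.int64).float()
radians = positions[:, None] / timescale[None, :]   # (max_seq_len, dim // 2)
emb = torch.cat((radians, radians), dim=-1)         # (max_seq_len, dim)
cos_cached, sin_cached = emb.cos(), emb.sin()

seq_len = 4
print(cos_cached[:seq_len].shape, sin_cached[:seq_len].shape)  # torch.Size([4, 8]) twice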
@@ -1034,6 +1045,15 @@ class GemmoeModel(GemmoePreTrainedModel):
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        # Scale embeddings
+        # Fix for precision issue when casting to bfloat16
+        hidden_size_sqrt = math.sqrt(self.config.hidden_size)
+        if inputs_embeds.dtype == torch.bfloat16:
+            # Do the scaling in float32, then cast back, to avoid precision loss
+            hidden_states = (inputs_embeds.float() * hidden_size_sqrt).to(torch.bfloat16)
+        else:
+            hidden_states = inputs_embeds * hidden_size_sqrt
+
         past_seen_tokens = 0
         if use_cache:  # kept for BC (cache positions)
             if not isinstance(past_key_values, StaticCache):
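The embedding-scaling hunk multiplies the embeddings by sqrt(hidden_size) and special-cases bfloat16 because the scale itself loses precision when rounded to bfloat16. A toy illustration of that rounding (the hidden_size value is arbitrary, not the Gemmoe config value):

import math
import torch

hidden_size = 2048
scale = math.sqrt(hidden_size)                    # 45.254833... as a Python float
print(torch.tensor(scale, dtype=torch.bfloat16))  # rounds to roughly 45.25 in bfloat16

inputs_embeds = torch.randn(1, 3, hidden_size, dtype=torch.bfloat16)
# Scale in float32 and cast back, as the hunk above does for bfloat16 inputs
hidden_states = (inputs_embeds.float() * scale).to(torch.bfloat16)
print(hidden_states.dtype)                        # torch.bfloat16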
 