Upload DogeForCausalLM
Browse files
- config.json +47 -44
- configuration_doge.py +13 -13
- generation_config.json +7 -7
- modeling_doge.py +19 -18
config.json
CHANGED
@@ -1,44 +1,47 @@
-{
-  "_name_or_path": "./results/Doge-60M",
-  "architectures": [
-    "DogeForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoConfig": "configuration_doge.DogeConfig",
-    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
-  },
-  "bos_token_id": 0,
-  "dynamic_mask_ratio": 0.0,
-  "eos_token_id": 1,
-  "expert_retrieval_size": 256,
-  "hidden_act": "silu",
-  "hidden_bias": false,
-  "hidden_dropout": 0.0,
-  "hidden_size": 512,
-  "initializer_range": 0.02,
-  "intermediate_size": 1024,
-  "is_moe": false,
-  "max_position_embeddings": 2048,
-  "model_type": "doge",
-  "num_attention_heads": 4,
-  "num_cdmmoe_experts": 2048,
-  "num_cdmmoe_experts_per_head": 8,
-  "num_cdmmoe_heads": 4,
-  …
+{
+  "_name_or_path": "./results/Doge-60M",
+  "architectures": [
+    "DogeForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_doge.DogeConfig",
+    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
+  },
+  "bos_token_id": 0,
+  "dynamic_mask_ratio": 0.0,
+  "eos_token_id": 1,
+  "expert_retrieval_size": 256,
+  "hidden_act": "silu",
+  "hidden_bias": false,
+  "hidden_dropout": 0.0,
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 1024,
+  "is_moe": false,
+  "max_position_embeddings": 2048,
+  "model_type": "doge",
+  "num_attention_heads": 4,
+  "num_cdmmoe_experts": 2048,
+  "num_cdmmoe_experts_per_head": 8,
+  "num_cdmmoe_heads": 4,
+  "num_cdmoe_experts": 16348,
+  "num_cdmoe_experts_per_head": 8,
+  "num_cdmoe_heads": 4,
+  "num_channels": 3,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 2,
+  "pad_token_id": 2,
+  "patch_size": 16,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "factor": 4.0,
+    "original_max_position_embeddings": 2048,
+    "rope_type": "dynamic"
+  },
+  "rope_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.1",
+  "use_cache": true,
+  "vocab_size": 32768
+}
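Because config.json keeps auto_map pointing at the bundled configuration_doge.py and modeling_doge.py, the checkpoint is loaded with trust_remote_code. A minimal loading sketch, assuming the files in this commit sit in a local directory (the path is a placeholder, not part of the commit):

    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    repo_or_path = "./Doge-60M"  # placeholder for wherever these files live
    config = AutoConfig.from_pretrained(repo_or_path, trust_remote_code=True)           # resolves configuration_doge.DogeConfig via auto_map
    model = AutoModelForCausalLM.from_pretrained(repo_or_path, trust_remote_code=True)  # resolves modeling_doge.DogeForCausalLM via auto_map
    tokenizer = AutoTokenizer.from_pretrained(repo_or_path)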
configuration_doge.py
CHANGED
@@ -40,7 +40,7 @@ class DogeConfig(PretrainedConfig):
     hidden_size (`int`, *optional*, defaults to 1024):
         Dimension of the hidden representations.
     intermediate_size (`int`, *optional*, defaults to 2048):
-        Dimension of the …
+        Dimension of the MLP representations.
     num_hidden_layers (`int`, *optional*, defaults to 32):
         Number of hidden layers in the Transformer decoder.
     hidden_bias (`bool`, *optional*, defaults to `False`):
@@ -115,13 +115,13 @@
         The ratio to control the proportion of the dynamic mask filled with the minimum value.
     is_moe (`bool`, *optional*, defaults to `False`):
         Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
-    …
-        Number of Private Experts for the Cross Domain Mixture of Experts.
-    …
+    num_cdmoe_experts (`int`, *optional*, defaults to 16348):
+        Number of Private Experts for the Cross Domain Mixture of Experts. calculation formula: :math:`\text{num_cdmoe_experts} = (32 \times \text{num_cdmoe_heads})^2`
+    num_cdmoe_heads (`int`, *optional*, defaults to 4):
         Number of heads of Private Experts for the Cross Domain Mixture of Experts.
-    …
+    num_cdmoe_experts_per_head (`int`, *optional*, defaults to 8):
         Number of Private Experts per head for the Cross Domain Mixture of Experts.
-    expert_retrieval_size (`int`, *optional*, defaults to …
+    expert_retrieval_size (`int`, *optional*, defaults to 64):
         Dimension of the Expert retrieval states for the Cross Domain Mixture of Experts.
     """
 
@@ -158,10 +158,10 @@
         attention_dropout=0.0,
         dynamic_mask_ratio=0.0,
         is_moe=False,
-        …
-        …
-        …
-        expert_retrieval_size=…
+        num_cdmoe_experts=16348,
+        num_cdmoe_heads=4,
+        num_cdmoe_experts_per_head=8,
+        expert_retrieval_size=64,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -188,9 +188,9 @@
         self.attention_dropout = attention_dropout
         self.dynamic_mask_ratio = dynamic_mask_ratio
         self.is_moe = is_moe
-        self.…
-        self.…
-        self.…
+        self.num_cdmoe_experts = num_cdmoe_experts
+        self.num_cdmoe_heads = num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = num_cdmoe_experts_per_head
        self.expert_retrieval_size = expert_retrieval_size
 
        # Validate the correctness of rotary position embeddings parameters
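The renamed num_cdmoe_* defaults are tied together by the docstring formula. A quick arithmetic check against the shipped defaults, mirroring the num_keys computation that DogeCDMoE performs in modeling_doge.py further down (a standalone sketch, not part of the commit):

    import math

    num_cdmoe_heads = 4
    formula_experts = (32 * num_cdmoe_heads) ** 2    # (32 * 4)^2 = 16384 per the docstring formula
    default_experts = 16348                          # default shipped in this commit
    num_keys = int(math.sqrt(default_experts))       # 127, same computation as DogeCDMoE.__init__
    addressable = num_keys ** 2                      # 16129 expert slots reachable through the 127 x 127 key grid
    print(formula_experts, num_keys, addressable)

Note that the formula gives 16384 for four heads while the shipped default is 16348, so the product-key grid built from int(sqrt(num_cdmoe_experts)) is 127 × 127 rather than 128 × 128.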
generation_config.json
CHANGED
@@ -1,7 +1,7 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 0,
-  "eos_token_id": 1,
-  "pad_token_id": 2,
-  "transformers_version": "4.…
-}
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 2,
+  "transformers_version": "4.47.1"
+}
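generation_config.json pins the special-token ids (bos 0, eos 1, pad 2) that generate() falls back on. A small usage sketch, reusing the model and tokenizer from the loading sketch above:

    import torch

    inputs = tokenizer("Hey, how are you doing?", return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=20)  # eos/pad ids are read from generation_config.json
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))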
modeling_doge.py
CHANGED
@@ -22,7 +22,6 @@ import math
 from typing import List, Optional, Tuple, Union
 
 import torch
-from torch.nn.attention.flex_attention import flex_attention
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
@@ -40,6 +39,7 @@ from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    is_torch_greater_or_equal,
     logging,
     replace_return_docstrings,
 )
@@ -50,6 +50,9 @@ try:
 except ImportError:
     einx_add = None
 
+if is_torch_greater_or_equal("2.5"):
+    from torch.nn.attention.flex_attention import flex_attention
+
 
 logger = logging.get_logger(__name__)
 
@@ -308,12 +311,10 @@ class DogeDynamicMaskAttention(nn.Module):
         min_type = torch.finfo(hidden_states.dtype).min
         attn_mask = dynamic_mask[:, :, None, :]
         if 0.0 < dynamic_mask_ratio < 1.0:
-            …
-            …
-            …
-            …
-            ).values
-            attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
+            num_dynamic_mask = int(attn_mask.shape[-1] * dynamic_mask_ratio)
+            if num_dynamic_mask > 0:
+                rate_value = torch.kthvalue(attn_mask, num_dynamic_mask, dim=-1, keepdim=True).values
+                attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
         if attention_mask is not None:
             attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : hidden_states.shape[-2]] == min_type, min_type)
         return attn_mask
@@ -479,18 +480,18 @@ class DogeCDMoE(DogeMLP):
         self.act_fn = ACT2FN[config.hidden_act]
 
         self.expert_retrieval_dim = config.expert_retrieval_size
-        self.…
-        self.…
-        self.…
-        self.num_keys = int(math.sqrt(self.…
+        self.num_cdmoe_experts = config.num_cdmoe_experts
+        self.num_cdmoe_heads = config.num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = config.num_cdmoe_experts_per_head
+        self.num_keys = int(math.sqrt(self.num_cdmoe_experts))
 
         # queries and keys for retrieval experts
-        self.queries = nn.Linear(self.hidden_dim, self.…
-        self.keys = nn.Parameter(torch.zeros(self.…
+        self.queries = nn.Linear(self.hidden_dim, self.num_cdmoe_heads * self.expert_retrieval_dim, bias=False)
+        self.keys = nn.Parameter(torch.zeros(self.num_cdmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
 
         # experts
-        self.down_embed = nn.Embedding(self.…
-        self.up_embed = nn.Embedding(self.…
+        self.down_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
+        self.up_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
 
     def forward(
         self,
@@ -501,11 +502,11 @@ class DogeCDMoE(DogeMLP):
 
         # get similarity with queries and keys
         queries = self.queries(hidden_states)
-        queries = queries.view(bsz, seq_len, 2, self.…
+        queries = queries.view(bsz, seq_len, 2, self.num_cdmoe_heads, -1).permute(2, 0, 1, 3, 4)
         sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, self.keys)
 
         # get experts with the highest similarity
-        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.…
+        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmoe_experts_per_head, dim=-1)
         if einx_add is not None:
             all_scores = einx_add("... i, ... j -> ... (i j)", scores_x, scores_y)
             all_indices = einx_add("... i, ... j -> ... (i j)", indices_x * self.num_keys, indices_y)
@@ -514,7 +515,7 @@ class DogeCDMoE(DogeMLP):
         all_scores = all_scores.view(*scores_x.shape[:-1], -1)
         all_indices = (indices_x.unsqueeze(-1) * self.num_keys) + indices_y.unsqueeze(-2)
         all_indices = all_indices.view(*indices_x.shape[:-1], -1)
-        scores, pk_indices = all_scores.topk(self.…
+        scores, pk_indices = all_scores.topk(self.num_cdmoe_experts_per_head, dim=-1)
         indices = all_indices.gather(-1, pk_indices)
         down_embed = self.down_embed(indices)
         up_embed = self.up_embed(indices)
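The reworked dynamic-mask path replaces the old multi-line expression with an explicit kth-value cut-off: roughly the lowest dynamic_mask_ratio fraction of key positions is pushed to the dtype minimum. A self-contained sketch of just that thresholding step, with assumed tensor shapes and a hypothetical function name (not the full attention module):

    import torch

    def threshold_dynamic_mask(dynamic_mask: torch.Tensor, dynamic_mask_ratio: float) -> torch.Tensor:
        # dynamic_mask: [batch, num_heads, key_len]; higher values mean "keep this position".
        min_type = torch.finfo(dynamic_mask.dtype).min
        attn_mask = dynamic_mask[:, :, None, :]
        if 0.0 < dynamic_mask_ratio < 1.0:
            num_dynamic_mask = int(attn_mask.shape[-1] * dynamic_mask_ratio)
            if num_dynamic_mask > 0:
                # the k-th smallest value along the key axis becomes the cut-off
                rate_value = torch.kthvalue(attn_mask, num_dynamic_mask, dim=-1, keepdim=True).values
                attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
        return attn_mask

    mask = threshold_dynamic_mask(torch.randn(1, 4, 128), dynamic_mask_ratio=0.25)
    print((mask == torch.finfo(mask.dtype).min).float().mean())  # roughly dynamic_mask_ratio of positions masked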