Text Generation · Transformers · Safetensors · English · doge · conversational · custom_code
Commit 438c141 by JingzeShi (verified) · 1 parent: 44e481b

Upload DogeForCausalLM
Files changed (4):
  1. config.json +47 -44
  2. configuration_doge.py +13 -13
  3. generation_config.json +7 -7
  4. modeling_doge.py +19 -18
config.json CHANGED
@@ -1,44 +1,47 @@
-{
-  "_name_or_path": "./results/Doge-60M",
-  "architectures": [
-    "DogeForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoConfig": "configuration_doge.DogeConfig",
-    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
-  },
-  "bos_token_id": 0,
-  "dynamic_mask_ratio": 0.0,
-  "eos_token_id": 1,
-  "expert_retrieval_size": 256,
-  "hidden_act": "silu",
-  "hidden_bias": false,
-  "hidden_dropout": 0.0,
-  "hidden_size": 512,
-  "initializer_range": 0.02,
-  "intermediate_size": 1024,
-  "is_moe": false,
-  "max_position_embeddings": 2048,
-  "model_type": "doge",
-  "num_attention_heads": 4,
-  "num_cdmmoe_experts": 2048,
-  "num_cdmmoe_experts_per_head": 8,
-  "num_cdmmoe_heads": 4,
-  "num_channels": 3,
-  "num_hidden_layers": 16,
-  "num_key_value_heads": 2,
-  "pad_token_id": 2,
-  "patch_size": 16,
-  "rms_norm_eps": 1e-06,
-  "rope_scaling": {
-    "factor": 4.0,
-    "original_max_position_embeddings": 2048,
-    "rope_type": "dynamic"
-  },
-  "rope_theta": 10000.0,
-  "torch_dtype": "float32",
-  "transformers_version": "4.49.0.dev0",
-  "use_cache": true,
-  "vocab_size": 32768
-}
+{
+  "_name_or_path": "./results/Doge-60M",
+  "architectures": [
+    "DogeForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_doge.DogeConfig",
+    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
+  },
+  "bos_token_id": 0,
+  "dynamic_mask_ratio": 0.0,
+  "eos_token_id": 1,
+  "expert_retrieval_size": 256,
+  "hidden_act": "silu",
+  "hidden_bias": false,
+  "hidden_dropout": 0.0,
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 1024,
+  "is_moe": false,
+  "max_position_embeddings": 2048,
+  "model_type": "doge",
+  "num_attention_heads": 4,
+  "num_cdmmoe_experts": 2048,
+  "num_cdmmoe_experts_per_head": 8,
+  "num_cdmmoe_heads": 4,
+  "num_cdmoe_experts": 16348,
+  "num_cdmoe_experts_per_head": 8,
+  "num_cdmoe_heads": 4,
+  "num_channels": 3,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 2,
+  "pad_token_id": 2,
+  "patch_size": 16,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "factor": 4.0,
+    "original_max_position_embeddings": 2048,
+    "rope_type": "dynamic"
+  },
+  "rope_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.1",
+  "use_cache": true,
+  "vocab_size": 32768
+}
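Since the config's auto_map points AutoConfig and AutoModelForCausalLM at the bundled configuration_doge.py and modeling_doge.py, loading the checkpoint requires trust_remote_code=True. A minimal loading sketch, assuming the hypothetical repo id JingzeShi/Doge-60M (the hub id is not stated in this commit):

from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "JingzeShi/Doge-60M"  # assumed repo id; substitute the actual checkpoint path

# auto_map routes these calls to configuration_doge.DogeConfig and modeling_doge.DogeForCausalLM
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

# the renamed CDMoE keys added in this commit are exposed on the loaded config
print(config.num_cdmoe_heads, config.num_cdmoe_experts, config.is_moe)  # 4 16348 False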
configuration_doge.py CHANGED
@@ -40,7 +40,7 @@ class DogeConfig(PretrainedConfig):
         hidden_size (`int`, *optional*, defaults to 1024):
             Dimension of the hidden representations.
         intermediate_size (`int`, *optional*, defaults to 2048):
-            Dimension of the CDMoE representations.
+            Dimension of the MLP representations.
         num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the Transformer decoder.
         hidden_bias (`bool`, *optional*, defaults to `False`):
@@ -115,13 +115,13 @@ class DogeConfig(PretrainedConfig):
             The ratio to control the proportion of the dynamic mask filled with the minimum value.
         is_moe (`bool`, *optional*, defaults to `False`):
             Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
-        num_cdmmoe_experts (`int`, *optional*, defaults to 2048):
-            Number of Private Experts for the Cross Domain Mixture of Experts.
-        num_cdmmoe_heads (`int`, *optional*, defaults to 4):
+        num_cdmoe_experts (`int`, *optional*, defaults to 16348):
+            Number of Private Experts for the Cross Domain Mixture of Experts. calculation formula: :math:`\text{num_cdmoe_experts} = (32 \times \text{num_cdmoe_heads})^2`
+        num_cdmoe_heads (`int`, *optional*, defaults to 4):
             Number of heads of Private Experts for the Cross Domain Mixture of Experts.
-        num_cdmmoe_experts_per_head (`int`, *optional*, defaults to 8):
+        num_cdmoe_experts_per_head (`int`, *optional*, defaults to 8):
             Number of Private Experts per head for the Cross Domain Mixture of Experts.
-        expert_retrieval_size (`int`, *optional*, defaults to 256):
+        expert_retrieval_size (`int`, *optional*, defaults to 64):
             Dimension of the Expert retrieval states for the Cross Domain Mixture of Experts.
     """
 
@@ -158,10 +158,10 @@
         attention_dropout=0.0,
         dynamic_mask_ratio=0.0,
         is_moe=False,
-        num_cdmmoe_experts=2048,
-        num_cdmmoe_heads=4,
-        num_cdmmoe_experts_per_head=8,
-        expert_retrieval_size=256,
+        num_cdmoe_experts=16348,
+        num_cdmoe_heads=4,
+        num_cdmoe_experts_per_head=8,
+        expert_retrieval_size=64,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -188,9 +188,9 @@
         self.attention_dropout = attention_dropout
         self.dynamic_mask_ratio = dynamic_mask_ratio
         self.is_moe = is_moe
-        self.num_cdmmoe_experts = num_cdmmoe_experts
-        self.num_cdmmoe_heads = num_cdmmoe_heads
-        self.num_cdmmoe_experts_per_head = num_cdmmoe_experts_per_head
+        self.num_cdmoe_experts = num_cdmoe_experts
+        self.num_cdmoe_heads = num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = num_cdmoe_experts_per_head
         self.expert_retrieval_size = expert_retrieval_size
 
         # Validate the correctness of rotary position embeddings parameters
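The updated docstring ties the expert count to the head count via :math:`\text{num_cdmoe_experts} = (32 \times \text{num_cdmoe_heads})^2`, and the modeling code (see the modeling_doge.py diff below) sizes its product-key grid as the integer square root of that count. A quick arithmetic sketch (illustration only, not repository code):

import math

num_cdmoe_heads = 4
formula_experts = (32 * num_cdmoe_heads) ** 2   # 16384 per the docstring formula
committed_default = 16348                       # default written into this commit

# DogeCDMoE uses num_keys = int(sqrt(num_cdmoe_experts)) keys per axis of its key grid
print(int(math.sqrt(formula_experts)))    # 128 -> 128 * 128 = 16384 addressable experts
print(int(math.sqrt(committed_default)))  # 127 -> 127 * 127 = 16129 addressable experts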
generation_config.json CHANGED
@@ -1,7 +1,7 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 0,
-  "eos_token_id": 1,
-  "pad_token_id": 2,
-  "transformers_version": "4.49.0.dev0"
-}
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 2,
+  "transformers_version": "4.47.1"
+}
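generation_config.json only pins the special token ids (bos 0, eos 1, pad 2) and the transformers version, and generate() reads it automatically. A minimal generation sketch, assuming the repository also ships a compatible tokenizer and using the hypothetical repo id JingzeShi/Doge-60M:

from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "JingzeShi/Doge-60M"  # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("Hey, how are you doing today?", return_tensors="pt")
# eos_token_id=1 and pad_token_id=2 are picked up from generation_config.json
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))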
modeling_doge.py CHANGED
@@ -22,7 +22,6 @@ import math
 from typing import List, Optional, Tuple, Union
 
 import torch
-from torch.nn.attention.flex_attention import flex_attention
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
@@ -40,6 +39,7 @@ from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    is_torch_greater_or_equal,
     logging,
     replace_return_docstrings,
 )
@@ -50,6 +50,9 @@ try:
 except ImportError:
     einx_add = None
 
+if is_torch_greater_or_equal("2.5"):
+    from torch.nn.attention.flex_attention import flex_attention
+
 
 logger = logging.get_logger(__name__)
 
@@ -308,12 +311,10 @@ class DogeDynamicMaskAttention(nn.Module):
         min_type = torch.finfo(hidden_states.dtype).min
         attn_mask = dynamic_mask[:, :, None, :]
         if 0.0 < dynamic_mask_ratio < 1.0:
-            rate_value = torch.kthvalue(
-                attn_mask,
-                int(attn_mask.shape[-1] * dynamic_mask_ratio),
-                dim=-1, keepdim=True,
-            ).values
-            attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
+            num_dynamic_mask = int(attn_mask.shape[-1] * dynamic_mask_ratio)
+            if num_dynamic_mask > 0:
+                rate_value = torch.kthvalue(attn_mask, num_dynamic_mask, dim=-1, keepdim=True).values
+                attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
         if attention_mask is not None:
             attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : hidden_states.shape[-2]] == min_type, min_type)
         return attn_mask
@@ -479,18 +480,18 @@ class DogeCDMoE(DogeMLP):
         self.act_fn = ACT2FN[config.hidden_act]
 
         self.expert_retrieval_dim = config.expert_retrieval_size
-        self.num_cdmmoe_experts = config.num_cdmmoe_experts
-        self.num_cdmmoe_heads = config.num_cdmmoe_heads
-        self.num_cdmmoe_experts_per_head = config.num_cdmmoe_experts_per_head
-        self.num_keys = int(math.sqrt(self.num_cdmmoe_experts))
+        self.num_cdmoe_experts = config.num_cdmoe_experts
+        self.num_cdmoe_heads = config.num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = config.num_cdmoe_experts_per_head
+        self.num_keys = int(math.sqrt(self.num_cdmoe_experts))
 
         # queries and keys for retrieval experts
-        self.queries = nn.Linear(self.hidden_dim, self.num_cdmmoe_heads * self.expert_retrieval_dim, bias=False)
-        self.keys = nn.Parameter(torch.zeros(self.num_cdmmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
+        self.queries = nn.Linear(self.hidden_dim, self.num_cdmoe_heads * self.expert_retrieval_dim, bias=False)
+        self.keys = nn.Parameter(torch.zeros(self.num_cdmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
 
         # experts
-        self.down_embed = nn.Embedding(self.num_cdmmoe_experts, self.hidden_dim)
-        self.up_embed = nn.Embedding(self.num_cdmmoe_experts, self.hidden_dim)
+        self.down_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
+        self.up_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
 
     def forward(
         self,
@@ -501,11 +502,11 @@ class DogeCDMoE(DogeMLP):
 
         # get similarity with queries and keys
         queries = self.queries(hidden_states)
-        queries = queries.view(bsz, seq_len, 2, self.num_cdmmoe_heads, -1).permute(2, 0, 1, 3, 4)
+        queries = queries.view(bsz, seq_len, 2, self.num_cdmoe_heads, -1).permute(2, 0, 1, 3, 4)
         sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, self.keys)
 
         # get experts with the highest similarity
-        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmmoe_experts_per_head, dim=-1)
+        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmoe_experts_per_head, dim=-1)
         if einx_add is not None:
             all_scores = einx_add("... i, ... j -> ... (i j)", scores_x, scores_y)
             all_indices = einx_add("... i, ... j -> ... (i j)", indices_x * self.num_keys, indices_y)
@@ -514,7 +515,7 @@ class DogeCDMoE(DogeMLP):
             all_scores = all_scores.view(*scores_x.shape[:-1], -1)
             all_indices = (indices_x.unsqueeze(-1) * self.num_keys) + indices_y.unsqueeze(-2)
             all_indices = all_indices.view(*indices_x.shape[:-1], -1)
-        scores, pk_indices = all_scores.topk(self.num_cdmmoe_experts_per_head, dim=-1)
+        scores, pk_indices = all_scores.topk(self.num_cdmoe_experts_per_head, dim=-1)
         indices = all_indices.gather(-1, pk_indices)
         down_embed = self.down_embed(indices)
         up_embed = self.up_embed(indices)
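The renamed CDMoE attributes all serve the same product-key retrieval: each query is split into two halves, each half is scored against one axis of a num_keys x num_keys key grid, per-axis top-k results are combined into candidate expert indices, and a second top-k keeps the best experts. A self-contained sketch of that composition step with toy shapes, mirroring the pure-PyTorch fallback used when einx is not installed (the pairwise score sum is inferred from the einx pattern and is an assumption, not repository code):

import torch

num_keys = 128        # int(sqrt(num_cdmoe_experts)) for 16384 experts
k = 8                 # num_cdmoe_experts_per_head
bsz, seq_len, heads = 1, 3, 4

# per-axis similarities, shape (2, bsz, seq_len, heads, num_keys), as produced by the einsum
sim = torch.randn(2, bsz, seq_len, heads, num_keys)

# top-k along each of the two key axes; unpacking splits the leading axis of size 2
(scores_x, scores_y), (indices_x, indices_y) = sim.topk(k, dim=-1)

# combine the two axes into k * k candidate experts per head (assumed pairwise sum)
all_scores = scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)
all_scores = all_scores.view(*scores_x.shape[:-1], -1)
all_indices = (indices_x.unsqueeze(-1) * num_keys) + indices_y.unsqueeze(-2)
all_indices = all_indices.view(*indices_x.shape[:-1], -1)

# keep the best k overall; these indices select rows of the down_embed / up_embed tables
scores, pk_indices = all_scores.topk(k, dim=-1)
indices = all_indices.gather(-1, pk_indices)
print(indices.shape)  # torch.Size([1, 3, 4, 8])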