Upload DogeForCausalLM
Browse files
- config.json +47 -44
- configuration_doge.py +13 -13
- generation_config.json +7 -7
- modeling_doge.py +19 -18
config.json
CHANGED
@@ -1,44 +1,47 @@
-{
-  "_name_or_path": "./results/Doge-60M",
-  "architectures": [
-    "DogeForCausalLM"
-  ],
-  "attention_dropout": 0.0,
-  "auto_map": {
-    "AutoConfig": "configuration_doge.DogeConfig",
-    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
-  },
-  "bos_token_id": 0,
-  "dynamic_mask_ratio": 0.0,
-  "eos_token_id": 1,
-  "expert_retrieval_size": 256,
-  "hidden_act": "silu",
-  "hidden_bias": false,
-  "hidden_dropout": 0.0,
-  "hidden_size": 512,
-  "initializer_range": 0.02,
-  "intermediate_size": 1024,
-  "is_moe": false,
-  "max_position_embeddings": 2048,
-  "model_type": "doge",
-  "num_attention_heads": 4,
-  "num_cdmmoe_experts": 2048,
-  "num_cdmmoe_experts_per_head": 8,
-  "num_cdmmoe_heads": 4,
-  …
+{
+  "_name_or_path": "./results/Doge-60M",
+  "architectures": [
+    "DogeForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_doge.DogeConfig",
+    "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
+  },
+  "bos_token_id": 0,
+  "dynamic_mask_ratio": 0.0,
+  "eos_token_id": 1,
+  "expert_retrieval_size": 256,
+  "hidden_act": "silu",
+  "hidden_bias": false,
+  "hidden_dropout": 0.0,
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 1024,
+  "is_moe": false,
+  "max_position_embeddings": 2048,
+  "model_type": "doge",
+  "num_attention_heads": 4,
+  "num_cdmmoe_experts": 2048,
+  "num_cdmmoe_experts_per_head": 8,
+  "num_cdmmoe_heads": 4,
+  "num_cdmoe_experts": 16348,
+  "num_cdmoe_experts_per_head": 8,
+  "num_cdmoe_heads": 4,
+  "num_channels": 3,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 2,
+  "pad_token_id": 2,
+  "patch_size": 16,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "factor": 4.0,
+    "original_max_position_embeddings": 2048,
+    "rope_type": "dynamic"
+  },
+  "rope_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.47.1",
+  "use_cache": true,
+  "vocab_size": 32768
+}
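Because config.json keeps auto_map pointing at the bundled configuration_doge.py and modeling_doge.py, the checkpoint is loaded with trust_remote_code. A minimal loading sketch, assuming the files in this commit sit in a local directory (the path is a placeholder, not part of the commit):

    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    repo_or_path = "./Doge-60M"  # placeholder for wherever these files live
    config = AutoConfig.from_pretrained(repo_or_path, trust_remote_code=True)           # resolves configuration_doge.DogeConfig via auto_map
    model = AutoModelForCausalLM.from_pretrained(repo_or_path, trust_remote_code=True)  # resolves modeling_doge.DogeForCausalLM via auto_map
    tokenizer = AutoTokenizer.from_pretrained(repo_or_path)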
configuration_doge.py
CHANGED
@@ -40,7 +40,7 @@ class DogeConfig(PretrainedConfig):
     hidden_size (`int`, *optional*, defaults to 1024):
         Dimension of the hidden representations.
     intermediate_size (`int`, *optional*, defaults to 2048):
-        Dimension of the …
+        Dimension of the MLP representations.
     num_hidden_layers (`int`, *optional*, defaults to 32):
         Number of hidden layers in the Transformer decoder.
     hidden_bias (`bool`, *optional*, defaults to `False`):
@@ -115,13 +115,13 @@
         The ratio to control the proportion of the dynamic mask filled with the minimum value.
     is_moe (`bool`, *optional*, defaults to `False`):
         Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
-    …
-        Number of Private Experts for the Cross Domain Mixture of Experts.
-    …
+    num_cdmoe_experts (`int`, *optional*, defaults to 16348):
+        Number of Private Experts for the Cross Domain Mixture of Experts. calculation formula: :math:`\text{num_cdmoe_experts} = (32 \times \text{num_cdmoe_heads})^2`
+    num_cdmoe_heads (`int`, *optional*, defaults to 4):
         Number of heads of Private Experts for the Cross Domain Mixture of Experts.
-    …
+    num_cdmoe_experts_per_head (`int`, *optional*, defaults to 8):
         Number of Private Experts per head for the Cross Domain Mixture of Experts.
-    expert_retrieval_size (`int`, *optional*, defaults to …
+    expert_retrieval_size (`int`, *optional*, defaults to 64):
         Dimension of the Expert retrieval states for the Cross Domain Mixture of Experts.
     """
 
@@ -158,10 +158,10 @@
         attention_dropout=0.0,
         dynamic_mask_ratio=0.0,
         is_moe=False,
-        …
-        …
-        …
-        expert_retrieval_size=…
+        num_cdmoe_experts=16348,
+        num_cdmoe_heads=4,
+        num_cdmoe_experts_per_head=8,
+        expert_retrieval_size=64,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -188,9 +188,9 @@
         self.attention_dropout = attention_dropout
         self.dynamic_mask_ratio = dynamic_mask_ratio
         self.is_moe = is_moe
-        self.…
-        self.…
-        self.…
+        self.num_cdmoe_experts = num_cdmoe_experts
+        self.num_cdmoe_heads = num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = num_cdmoe_experts_per_head
        self.expert_retrieval_size = expert_retrieval_size
 
        # Validate the correctness of rotary position embeddings parameters
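The renamed num_cdmoe_* defaults are tied together by the docstring formula. A quick arithmetic check against the shipped defaults, mirroring the num_keys computation that DogeCDMoE performs in modeling_doge.py further down (a standalone sketch, not part of the commit):

    import math

    num_cdmoe_heads = 4
    formula_experts = (32 * num_cdmoe_heads) ** 2    # (32 * 4)^2 = 16384 per the docstring formula
    default_experts = 16348                          # default shipped in this commit
    num_keys = int(math.sqrt(default_experts))       # 127, same computation as DogeCDMoE.__init__
    addressable = num_keys ** 2                      # 16129 expert slots reachable through the 127 x 127 key grid
    print(formula_experts, num_keys, addressable)

Note that the formula gives 16384 for four heads while the shipped default is 16348, so the product-key grid built from int(sqrt(num_cdmoe_experts)) is 127 × 127 rather than 128 × 128.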
generation_config.json
CHANGED
@@ -1,7 +1,7 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 0,
-  "eos_token_id": 1,
-  "pad_token_id": 2,
-  "transformers_version": "4.…
-}
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 2,
+  "transformers_version": "4.47.1"
+}
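generation_config.json pins the special-token ids (bos 0, eos 1, pad 2) that generate() falls back on. A small usage sketch, reusing the model and tokenizer from the loading sketch above:

    import torch

    inputs = tokenizer("Hey, how are you doing?", return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=20)  # eos/pad ids are read from generation_config.json
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))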
modeling_doge.py
CHANGED
@@ -22,7 +22,6 @@ import math
 from typing import List, Optional, Tuple, Union
 
 import torch
-from torch.nn.attention.flex_attention import flex_attention
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
@@ -40,6 +39,7 @@ from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    is_torch_greater_or_equal,
     logging,
     replace_return_docstrings,
 )
@@ -50,6 +50,9 @@ try:
 except ImportError:
     einx_add = None
 
+if is_torch_greater_or_equal("2.5"):
+    from torch.nn.attention.flex_attention import flex_attention
+
 
 logger = logging.get_logger(__name__)
 
@@ -308,12 +311,10 @@ class DogeDynamicMaskAttention(nn.Module):
         min_type = torch.finfo(hidden_states.dtype).min
         attn_mask = dynamic_mask[:, :, None, :]
         if 0.0 < dynamic_mask_ratio < 1.0:
-            …
-            …
-            …
-            …
-            ).values
-            attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
+            num_dynamic_mask = int(attn_mask.shape[-1] * dynamic_mask_ratio)
+            if num_dynamic_mask > 0:
+                rate_value = torch.kthvalue(attn_mask, num_dynamic_mask, dim=-1, keepdim=True).values
+                attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
         if attention_mask is not None:
             attn_mask = attn_mask.masked_fill(attention_mask[:, :, :, : hidden_states.shape[-2]] == min_type, min_type)
         return attn_mask
@@ -479,18 +480,18 @@ class DogeCDMoE(DogeMLP):
         self.act_fn = ACT2FN[config.hidden_act]
 
         self.expert_retrieval_dim = config.expert_retrieval_size
-        self.…
-        self.…
-        self.…
-        self.num_keys = int(math.sqrt(self.…
+        self.num_cdmoe_experts = config.num_cdmoe_experts
+        self.num_cdmoe_heads = config.num_cdmoe_heads
+        self.num_cdmoe_experts_per_head = config.num_cdmoe_experts_per_head
+        self.num_keys = int(math.sqrt(self.num_cdmoe_experts))
 
         # queries and keys for retrieval experts
-        self.queries = nn.Linear(self.hidden_dim, self.…
-        self.keys = nn.Parameter(torch.zeros(self.…
+        self.queries = nn.Linear(self.hidden_dim, self.num_cdmoe_heads * self.expert_retrieval_dim, bias=False)
+        self.keys = nn.Parameter(torch.zeros(self.num_cdmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
 
         # experts
-        self.down_embed = nn.Embedding(self.…
-        self.up_embed = nn.Embedding(self.…
+        self.down_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
+        self.up_embed = nn.Embedding(self.num_cdmoe_experts, self.hidden_dim)
 
     def forward(
         self,
@@ -501,11 +502,11 @@ class DogeCDMoE(DogeMLP):
 
         # get similarity with queries and keys
         queries = self.queries(hidden_states)
-        queries = queries.view(bsz, seq_len, 2, self.…
+        queries = queries.view(bsz, seq_len, 2, self.num_cdmoe_heads, -1).permute(2, 0, 1, 3, 4)
         sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, self.keys)
 
         # get experts with the highest similarity
-        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.…
+        (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmoe_experts_per_head, dim=-1)
         if einx_add is not None:
             all_scores = einx_add("... i, ... j -> ... (i j)", scores_x, scores_y)
             all_indices = einx_add("... i, ... j -> ... (i j)", indices_x * self.num_keys, indices_y)
@@ -514,7 +515,7 @@ class DogeCDMoE(DogeMLP):
         all_scores = all_scores.view(*scores_x.shape[:-1], -1)
         all_indices = (indices_x.unsqueeze(-1) * self.num_keys) + indices_y.unsqueeze(-2)
         all_indices = all_indices.view(*indices_x.shape[:-1], -1)
-        scores, pk_indices = all_scores.topk(self.…
+        scores, pk_indices = all_scores.topk(self.num_cdmoe_experts_per_head, dim=-1)
         indices = all_indices.gather(-1, pk_indices)
         down_embed = self.down_embed(indices)
         up_embed = self.up_embed(indices)
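The reworked dynamic-mask path replaces the old multi-line expression with an explicit kth-value cut-off: roughly the lowest dynamic_mask_ratio fraction of key positions is pushed to the dtype minimum. A self-contained sketch of just that thresholding step, with assumed tensor shapes and a hypothetical function name (not the full attention module):

    import torch

    def threshold_dynamic_mask(dynamic_mask: torch.Tensor, dynamic_mask_ratio: float) -> torch.Tensor:
        # dynamic_mask: [batch, num_heads, key_len]; higher values mean "keep this position".
        min_type = torch.finfo(dynamic_mask.dtype).min
        attn_mask = dynamic_mask[:, :, None, :]
        if 0.0 < dynamic_mask_ratio < 1.0:
            num_dynamic_mask = int(attn_mask.shape[-1] * dynamic_mask_ratio)
            if num_dynamic_mask > 0:
                # the k-th smallest value along the key axis becomes the cut-off
                rate_value = torch.kthvalue(attn_mask, num_dynamic_mask, dim=-1, keepdim=True).values
                attn_mask = attn_mask.masked_fill(attn_mask < rate_value, min_type)
        return attn_mask

    mask = threshold_dynamic_mask(torch.randn(1, 4, 128), dynamic_mask_ratio=0.25)
    print((mask == torch.finfo(mask.dtype).min).float().mean())  # roughly dynamic_mask_ratio of positions masked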