Crystalcareai committed
Update configuration_gemmoe.py

configuration_gemmoe.py  +0 -44
CHANGED
@@ -92,30 +92,6 @@ class GemmoeConfig(PretrainedConfig):
         output_router_logits (`bool`, *optional*, defaults to `False`):
             Whether or not to output the logits of the routers. They are useful for computing the router loss, and
             should not be returned during inference.
-        n_shared_experts (`int`, *optional*, defaults to `None`):
-            The number of shared experts used in the sparse mixture of experts layer. If set to `None`, no shared
-            experts are used.
-        n_routed_experts (`int`, *optional*, defaults to `None`):
-            The number of routed experts used in the sparse mixture of experts layer. If set to `None`, all experts are
-            routed experts.
-        moe_layer_freq (`int`, *optional*, defaults to 1):
-            The frequency of MoE layers in the model. A value of 1 means MoE layers are used in every layer, a value of
-            2 means MoE layers are used in every other layer, and so on.
-        first_k_dense_replace (`int`, *optional*, defaults to 0):
-            The number of initial dense layers to replace with MoE layers. If set to 0 (default), no dense layers are
-            replaced.
-        norm_topk_prob (`bool`, *optional*, defaults to `False`):
-            Whether to normalize the top-k probabilities of the router during training.
-        scoring_func (`str`, *optional*, defaults to `'softmax'`):
-            The scoring function used by the router. Can be 'softmax' or 'remap'.
-        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
-            The weight of the auxiliary loss used for training the router.
-        seq_aux (`bool`, *optional*, defaults to `True`):
-            Whether to use sequence-level auxiliary loss for training the router.
-        pretraining_tp (`int`, *optional*, defaults to 1):
-            The tensor parallelism used for pretraining.
-        rope_scaling (`float`, *optional*, defaults to `None`):
-            The scaling factor for the Rotary Position Embedding (RoPE). If set to `None`, no scaling is applied.

    ```python
    >>> from transformers import GemmoeModel, GemmoeConfig
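For context, the kept docstring lines note that router logits are only needed for computing the router loss. Below is a hedged sketch of how `output_router_logits` and `router_aux_loss_coef` are typically consumed, in the style of a Switch/Mixtral load-balancing auxiliary loss; it is illustrative only and not taken from `modeling_gemmoe.py` (the helper name, tensor shapes, and formula details are assumptions).

```python
# Illustrative only: a Switch/Mixtral-style load-balancing auxiliary loss that
# consumes router logits and is scaled by router_aux_loss_coef. This is a
# sketch of how the two retained fields are typically used, not Gemmoe's code.
import torch
import torch.nn.functional as F

def load_balancing_aux_loss(router_logits: torch.Tensor,
                            num_experts: int,
                            top_k: int,
                            coef: float) -> torch.Tensor:
    """router_logits: (num_tokens, num_experts), concatenated over MoE layers."""
    probs = F.softmax(router_logits, dim=-1)                 # routing probabilities
    _, selected = torch.topk(probs, top_k, dim=-1)           # chosen experts per token
    expert_mask = F.one_hot(selected, num_experts).float()   # (tokens, top_k, experts)
    tokens_per_expert = expert_mask.mean(dim=(0, 1))         # fraction of tokens per expert
    router_prob_per_expert = probs.mean(dim=0)               # mean routing probability per expert
    return coef * num_experts * torch.sum(tokens_per_expert * router_prob_per_expert)

# Example with the defaults shown in this diff:
# num_local_experts=8, num_experts_per_tok=2, router_aux_loss_coef=0.02
logits = torch.randn(16, 8)
aux = load_balancing_aux_loss(logits, num_experts=8, top_k=2, coef=0.02)
```

With `output_router_logits=True`, a training loop can add a term like this to the language-modeling loss; during inference it stays off, which matches the docstring's note that the logits should not be returned then.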
@@ -156,16 +132,6 @@ class GemmoeConfig(PretrainedConfig):
         attention_dropout=0.0,
         num_experts_per_tok=2,
         num_local_experts=8,
-        n_shared_experts=8,
-        n_routed_experts=2,
-        moe_layer_freq=1,
-        first_k_dense_replace=0,
-        norm_topk_prob=False,
-        scoring_func='softmax',
-        aux_loss_alpha=0.001,
-        seq_aux=True,
-        pretraining_tp=1,
-        rope_scaling=None,
         router_aux_loss_coef=0.02,
         output_router_logits=False,
         **kwargs,
@@ -187,16 +153,6 @@ class GemmoeConfig(PretrainedConfig):
         self.attention_dropout = attention_dropout
         self.num_experts_per_tok = num_experts_per_tok
         self.num_local_experts = num_local_experts
-        self.n_shared_experts = n_shared_experts
-        self.n_routed_experts = n_routed_experts
-        self.moe_layer_freq = moe_layer_freq
-        self.first_k_dense_replace = first_k_dense_replace
-        self.norm_topk_prob = norm_topk_prob
-        self.scoring_func = scoring_func
-        self.aux_loss_alpha = aux_loss_alpha
-        self.seq_aux = seq_aux
-        self.pretraining_tp = pretraining_tp
-        self.rope_scaling = rope_scaling
         self.router_aux_loss_coef = router_aux_loss_coef
         self.output_router_logits = output_router_logits

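After all three hunks, the constructor's explicit MoE surface is reduced to `num_experts_per_tok`, `num_local_experts`, `router_aux_loss_coef`, and `output_router_logits`. A minimal usage sketch under that assumption, with the import taken from the docstring example and the values mirroring the defaults shown in the diff:

```python
# Minimal sketch, assuming GemmoeConfig is importable as in the docstring
# example above; values mirror the defaults kept by this commit.
from transformers import GemmoeConfig

config = GemmoeConfig(
    num_experts_per_tok=2,       # top-k experts routed per token
    num_local_experts=8,         # experts per MoE layer
    router_aux_loss_coef=0.02,   # weight on the router auxiliary loss
    output_router_logits=False,  # only needed when computing that loss
)

# The removed names (n_shared_experts, moe_layer_freq, pretraining_tp, ...)
# are no longer explicit arguments; if passed, they would fall through to
# **kwargs and be handled by PretrainedConfig like any other extra field.
print(config.num_local_experts, config.router_aux_loss_coef)
```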