Crystalcareai committed on
Commit
2e51e15
·
verified ·
1 Parent(s): 292a484

Update configuration_gemmoe.py

Browse files
Files changed (1) hide show
  1. configuration_gemmoe.py +0 -44
configuration_gemmoe.py CHANGED
@@ -92,30 +92,6 @@ class GemmoeConfig(PretrainedConfig):
92
  output_router_logits (`bool`, *optional*, defaults to `False`):
93
  Whether or not to output the logits of the routers. They are useful for computing the router loss, and
94
  should not be returned during inference.
95
- n_shared_experts (`int`, *optional*, defaults to `None`):
96
- The number of shared experts used in the sparse mixture of experts layer. If set to `None`, no shared
97
- experts are used.
98
- n_routed_experts (`int`, *optional*, defaults to `None`):
99
- The number of routed experts used in the sparse mixture of experts layer. If set to `None`, all experts are
100
- routed experts.
101
- moe_layer_freq (`int`, *optional*, defaults to 1):
102
- The frequency of MoE layers in the model. A value of 1 means MoE layers are used in every layer, a value of
103
- 2 means MoE layers are used in every other layer, and so on.
104
- first_k_dense_replace (`int`, *optional*, defaults to 0):
105
- The number of initial dense layers to replace with MoE layers. If set to 0 (default), no dense layers are
106
- replaced.
107
- norm_topk_prob (`bool`, *optional*, defaults to `False`):
108
- Whether to normalize the top-k probabilities of the router during training.
109
- scoring_func (`str`, *optional*, defaults to `'softmax'`):
110
- The scoring function used by the router. Can be 'softmax' or 'remap'.
111
- aux_loss_alpha (`float`, *optional*, defaults to 0.001):
112
- The weight of the auxiliary loss used for training the router.
113
- seq_aux (`bool`, *optional*, defaults to `True`):
114
- Whether to use sequence-level auxiliary loss for training the router.
115
- pretraining_tp (`int`, *optional*, defaults to 1):
116
- The tensor parallelism used for pretraining.
117
- rope_scaling (`float`, *optional*, defaults to `None`):
118
- The scaling factor for the Rotary Position Embedding (RoPE). If set to `None`, no scaling is applied.
119
 
120
  ```python
121
  >>> from transformers import GemmoeModel, GemmoeConfig
@@ -156,16 +132,6 @@ class GemmoeConfig(PretrainedConfig):
156
  attention_dropout=0.0,
157
  num_experts_per_tok=2,
158
  num_local_experts=8,
159
- n_shared_experts=8,
160
- n_routed_experts=2,
161
- moe_layer_freq=1,
162
- first_k_dense_replace=0,
163
- norm_topk_prob=False,
164
- scoring_func='softmax',
165
- aux_loss_alpha=0.001,
166
- seq_aux=True,
167
- pretraining_tp=1,
168
- rope_scaling=None,
169
  router_aux_loss_coef=0.02,
170
  output_router_logits=False,
171
  **kwargs,
@@ -187,16 +153,6 @@ class GemmoeConfig(PretrainedConfig):
187
  self.attention_dropout = attention_dropout
188
  self.num_experts_per_tok = num_experts_per_tok
189
  self.num_local_experts = num_local_experts
190
- self.n_shared_experts = n_shared_experts
191
- self.n_routed_experts = n_routed_experts
192
- self.moe_layer_freq = moe_layer_freq
193
- self.first_k_dense_replace = first_k_dense_replace
194
- self.norm_topk_prob = norm_topk_prob
195
- self.scoring_func = scoring_func
196
- self.aux_loss_alpha = aux_loss_alpha
197
- self.seq_aux = seq_aux
198
- self.pretraining_tp = pretraining_tp
199
- self.rope_scaling = rope_scaling
200
  self.router_aux_loss_coef = router_aux_loss_coef
201
  self.output_router_logits = output_router_logits
202
 
 
92
  output_router_logits (`bool`, *optional*, defaults to `False`):
93
  Whether or not to output the logits of the routers. They are useful for computing the router loss, and
94
  should not be returned during inference.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  ```python
97
  >>> from transformers import GemmoeModel, GemmoeConfig
 
132
  attention_dropout=0.0,
133
  num_experts_per_tok=2,
134
  num_local_experts=8,
 
 
 
 
 
 
 
 
 
 
135
  router_aux_loss_coef=0.02,
136
  output_router_logits=False,
137
  **kwargs,
 
153
  self.attention_dropout = attention_dropout
154
  self.num_experts_per_tok = num_experts_per_tok
155
  self.num_local_experts = num_local_experts
 
 
 
 
 
 
 
 
 
 
156
  self.router_aux_loss_coef = router_aux_loss_coef
157
  self.output_router_logits = output_router_logits
158