Crystalcareai committed: Update modeling_gemmoe.py

modeling_gemmoe.py CHANGED (+16 -47)
@@ -683,14 +683,7 @@ class GemmoeBlockSparseTop2MLP(GemmoeBlockSparseTop2MLP):
 
 class GemmoeSparseMoeBlock(nn.Module):
     """
-    This implementation is
-    strictly equivalent to standard MoE with full capacity (no
-    dropped tokens). It's faster since it formulates MoE operations
-    in terms of block-sparse operations to accomodate imbalanced
-    assignments of tokens to experts, whereas standard MoE either
-    (1) drop tokens at the cost of reduced performance or (2) set
-    capacity factor to number of experts and thus waste computation
-    and memory on padding.
+    This implementation is strictly equivalent to standard MoE with full capacity (no dropped tokens). It's faster since it formulates MoE operations in terms of block-sparse operations to accommodate imbalanced assignments of tokens to experts.
     """
 
     def __init__(self, config):
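As a quick illustration of the "full capacity / no dropped tokens" wording in the rewritten docstring, the standalone sketch below (not code from this repository; the sizes and top_k=2 are made up) routes a toy batch and checks that every token keeps all of its top-k expert assignments, however imbalanced the per-expert counts turn out to be:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
num_tokens, num_experts, top_k = 8, 4, 2   # toy sizes, not taken from the model config
router_logits = torch.randn(num_tokens, num_experts)

routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
topk_weight, topk_idx = torch.topk(routing_weights, top_k, dim=-1, sorted=False)

# Every token contributes exactly top_k assignments; none are dropped,
# even when one expert receives far more tokens than another.
assignments_per_expert = torch.bincount(topk_idx.view(-1), minlength=num_experts)
assert assignments_per_expert.sum().item() == num_tokens * top_k
print(assignments_per_expert)   # per-expert load, sums to num_tokens * top_k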
@@ -706,51 +699,26 @@ class GemmoeSparseMoeBlock(nn.Module):
         self.experts = nn.ModuleList([GemmoeBlockSparseTop2MLP(config) for _ in range(self.num_experts)])
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """ """
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
         # router_logits: (batch * sequence_length, n_experts)
         router_logits = self.gate(hidden_states)
 
         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
-        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
-        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-        # we cast back to the input dtype
-        routing_weights = routing_weights.to(hidden_states.dtype)
-
-        final_hidden_states = torch.zeros(
-            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
-        )
-
-        # One hot encode the selected experts to create an expert mask
-        # this will be used to easily index which expert is going to be solicited
-        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
-
-        # Loop over all available experts in the model and perform the computation on each expert
-        for expert_idx in range(self.num_experts):
-            expert_layer = self.experts[expert_idx]
-            idx, top_x = torch.where(expert_mask[expert_idx])
-
-            if top_x.shape[0] == 0:
-                for param in expert_layer.parameters():
-                    if param.requires_grad:
-                        param.grad = torch.zeros_like(param)
-                continue
-
-            # in torch it is faster to index using lists than torch tensors
-            top_x_list = top_x.tolist()
-            idx_list = idx.tolist()
-
-            # Index the correct hidden states and compute the expert hidden state for
-            # the current expert. We need to make sure to multiply the output hidden
-            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
-            current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim)
-            current_hidden_states = expert_layer(current_state) * routing_weights[top_x_list, idx_list, None]
-
-            # However `index_add_` only support torch tensors for indexing so we'll use
-            # the `top_x` tensor here.
-            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
-        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
+        topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
+        topk_weight = topk_weight.to(hidden_states.dtype)
+
+        hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
+        y = torch.empty_like(hidden_states)
+        flat_topk_idx = topk_idx.view(-1)
+        for i in range(self.num_experts):
+            expert = self.experts[i]
+            mask = flat_topk_idx == i
+            if mask.any():
+                y[mask] = expert(hidden_states[mask])
+        y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+        final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
         return final_hidden_states, router_logits
 
 
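For reviewers who want to poke at the new routing path in isolation, here is a minimal, self-contained sketch of the logic introduced above. It is not taken from the repository: the toy sizes, the ToyExpert module, and the standalone moe_forward function are illustrative stand-ins for GemmoeBlockSparseTop2MLP and GemmoeSparseMoeBlock.forward.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyExpert(nn.Module):
    """Stand-in for GemmoeBlockSparseTop2MLP: any (hidden_dim -> hidden_dim) module works."""
    def __init__(self, hidden_dim):
        super().__init__()
        self.proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        return self.proj(x)

def moe_forward(hidden_states, gate, experts, top_k):
    """Mirrors the new forward: top-k routing, dense dispatch, weighted merge."""
    batch_size, sequence_length, hidden_dim = hidden_states.shape
    hidden_states = hidden_states.view(-1, hidden_dim)

    router_logits = gate(hidden_states)                         # (tokens, n_experts)
    routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
    topk_weight, topk_idx = torch.topk(routing_weights, top_k, dim=-1, sorted=False)
    topk_weight /= topk_weight.sum(dim=-1, keepdim=True)        # renormalize over the kept experts
    topk_weight = topk_weight.to(hidden_states.dtype)

    # Duplicate each token top_k times so row j*top_k + k goes to its k-th chosen expert.
    hidden_states = hidden_states.repeat_interleave(top_k, dim=0)
    y = torch.empty_like(hidden_states)
    flat_topk_idx = topk_idx.view(-1)
    for i, expert in enumerate(experts):
        mask = flat_topk_idx == i
        if mask.any():
            y[mask] = expert(hidden_states[mask])

    # Weight each expert output by its renormalized routing weight and sum over the top_k slots.
    y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
    return y.reshape(batch_size, sequence_length, hidden_dim), router_logits

# Toy usage: 2 x 5 tokens, hidden size 16, 4 experts, top-2 routing.
hidden_dim, num_experts, top_k = 16, 4, 2
gate = nn.Linear(hidden_dim, num_experts, bias=False)
experts = nn.ModuleList([ToyExpert(hidden_dim) for _ in range(num_experts)])
out, logits = moe_forward(torch.randn(2, 5, hidden_dim), gate, experts, top_k)
print(out.shape, logits.shape)   # torch.Size([2, 5, 16]) torch.Size([10, 4])

Relative to the removed expert_mask / index_add_ path, this formulation duplicates every token top_k times (more activation memory) but drops the Python-list indexing and the per-expert gradient bookkeeping, which is presumably the simplification this commit is after.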
@@ -1297,6 +1265,7 @@ class GemmoeForCausalLM(GemmoePreTrainedModel):
         hidden_states = outputs[0]
         logits = self.lm_head(hidden_states)
         logits = logits.float()
+
         if self.training:
             for expert in self.model.layers[-1].block_sparse_moe.experts:
                 for param in expert.parameters():
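The last hunk is cut off in this view right after "for param in expert.parameters():", so the statement it applies is not shown. For context only: one common reason to walk every expert's parameters during training, and what the removed forward() did for experts that received no tokens, is to keep unused experts' gradients defined. A self-contained sketch of that idea, using a toy module rather than GemmoeForCausalLM, might look like this:

import torch
import torch.nn as nn

experts = nn.ModuleList([nn.Linear(4, 4) for _ in range(3)])
x = torch.randn(5, 4)

# Pretend only expert 0 was routed any tokens this step.
loss = experts[0](x).sum()
loss.backward()

# Give every parameter of the unused experts a defined (zero) gradient,
# mirroring what the removed forward() did for experts with no tokens.
for expert in experts:
    for param in expert.parameters():
        if param.requires_grad and param.grad is None:
            param.grad = torch.zeros_like(param)

print([p.grad is not None for e in experts for p in e.parameters()])  # all True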