Crystalcareai committed: Update modeling_gemmoe.py

modeling_gemmoe.py CHANGED (+4, -14)
@@ -1220,14 +1220,6 @@ class GemmoeForCausalLM(GemmoePreTrainedModel):
         hidden_states = hidden_states.to(dtype=self.lm_head.weight.dtype)
 
         logits = self.lm_head(hidden_states)
-        logits = logits.float()
-
-        # Handle unused parameters
-        if self.training:
-            for expert in self.model.layers[-1].block_sparse_moe.experts:
-                for param in expert.parameters():
-                    if param.requires_grad and param.grad is None:
-                        param.grad = torch.zeros_like(param)
 
         loss = None
         if labels is not None:
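This hunk removes the float32 upcast of the logits together with a training-time workaround that stubbed in zero gradients for experts the router never selected, a common way to satisfy DistributedDataParallel's unused-parameter check in MoE models. A minimal sketch of what the removed workaround did, with a hypothetical experts list standing in for model.layers[-1].block_sparse_moe.experts:

import torch
import torch.nn as nn

# Hypothetical stand-in for model.layers[-1].block_sparse_moe.experts.
experts = nn.ModuleList([nn.Linear(8, 8) for _ in range(4)])

def zero_fill_unused_expert_grads(experts: nn.ModuleList) -> None:
    # Any expert parameter still lacking a gradient after backward (the
    # router dispatched no tokens to it) gets an all-zero gradient, so a
    # distributed reducer sees every parameter as "used".
    for expert in experts:
        for param in expert.parameters():
            if param.requires_grad and param.grad is None:
                param.grad = torch.zeros_like(param)

# Backward pass that touches only expert 0:
experts[0](torch.randn(2, 8)).sum().backward()
zero_fill_unused_expert_grads(experts)
assert all(p.grad is not None for e in experts for p in e.parameters())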
@@ -1306,8 +1298,8 @@ class GemmoeForCausalLM(GemmoePreTrainedModel):
                 past_length = 0
             else:
                 past_length = cache_position[-1] + 1
-            input_ids = input_ids[:, past_length:]
-            position_ids = position_ids[:, past_length:]
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+            position_ids = position_ids[:, -1].unsqueeze(-1)
 
             cache_position = torch.arange(past_length, past_length + position_ids.shape[-1], device=position_ids.device)
 
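In the static-cache branch of prepare_inputs_for_generation, the slice from past_length onward is replaced by an unconditional last-token slice, so each decode step now feeds a (batch, 1) tensor. A toy shape check (values made up):

import torch

input_ids = torch.tensor([[5, 9, 2, 7],
                          [3, 1, 4, 8]])  # (batch=2, seq=4)

# New behaviour: keep only the most recent token for the next step.
last = input_ids[:, -1].unsqueeze(-1)
print(last.shape)  # torch.Size([2, 1])
print(last)        # tensor([[7], [8]])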
@@ -1426,10 +1418,8 @@ class GemmoeForSequenceClassification(GemmoePreTrainedModel):
             sequence_lengths = -1
         else:
             if input_ids is not None:
-
-                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
-                sequence_lengths = sequence_lengths % input_ids.shape[-1]
-                sequence_lengths = sequence_lengths.to(logits.device)
+                sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
+                sequence_lengths = sequence_lengths.clamp(min=0).to(logits.device)
             else:
                 sequence_lengths = -1
 
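For sequence classification, the index of the last real token is now computed by counting non-pad tokens (clamped at zero) rather than by locating the first pad token and wrapping with a modulo. The two agree on right-padded batches, as the toy comparison below shows (pad_token_id and the inputs are made up); they can differ if pad tokens occur mid-sequence.

import torch

pad_token_id = 0
input_ids = torch.tensor([
    [11, 12, 13, 0, 0],    # 3 real tokens, right-padded
    [21, 22, 23, 24, 25],  # no padding
])

# New approach: index of the last non-pad token, assuming right padding.
seq_len_new = torch.ne(input_ids, pad_token_id).sum(-1) - 1
seq_len_new = seq_len_new.clamp(min=0)
print(seq_len_new)  # tensor([2, 4])

# Old approach: position before the first pad token; the modulo maps an
# all-real row (argmax == 0, so index -1) to the last position.
seq_len_old = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
seq_len_old = seq_len_old % input_ids.shape[-1]
print(seq_len_old)  # tensor([2, 4])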