Text Generation · Transformers · Safetensors · lola_v1 · custom_code
neo-nlp-dev committed · Commit 6eae452 · 1 Parent(s): 3a7a4b9

updating model class and logo


- fixing multi-device training for the model
- updating the logo to a sharpened version

Files changed (2)
  1. lola-logo.png +0 -0
  2. modeling_lola_gpt2.py +7 -1
lola-logo.png CHANGED
modeling_lola_gpt2.py CHANGED
```diff
@@ -204,7 +204,7 @@ class LOLAModel(GPT2PreTrainedModel):
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
-            # self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
             input_shape = input_ids.size()
             input_ids = input_ids.view(-1, input_shape[-1])
             batch_size = input_ids.shape[0]
```
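The first hunk re-enables `warn_if_padding_and_no_attention_mask`, a standard `PreTrainedModel` helper in `transformers` that logs a one-time warning when `input_ids` contain the pad token but no `attention_mask` is supplied. A minimal sketch of the situation it flags; the `dice-research/lola_v1` repo id and the pad-token setup are assumptions based on the tags above, not stated in this commit:

```python
# Sketch: a padded batch passed without an attention mask.
# Repo id and pad-token setup are assumptions, not from the commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dice-research/lola_v1", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("dice-research/lola_v1", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # GPT2-style tokenizers ship without a pad token

batch = tokenizer(["short", "a much longer input sequence"], padding=True, return_tensors="pt")

# Pad positions are attended to here; with this commit the model logs a warning.
out_unmasked = model(input_ids=batch["input_ids"])

# Correct call: pass the mask so padding is ignored.
out_masked = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
```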
```diff
@@ -537,6 +537,12 @@ class LOLALMHeadModel(GPT2LMHeadModel):
             return_dict=True, # Ensure we get a MoeModelOutputWithPast
         )
         hidden_states = transformer_outputs.last_hidden_state
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.transformer.first_device)
+            hidden_states = hidden_states.to(self.lm_head.weight.device)
+
         lm_logits = self.lm_head(hidden_states)
 
         aux_loss = transformer_outputs.aux_loss if hasattr(transformer_outputs, 'aux_loss') else None
```
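The second hunk mirrors upstream `GPT2LMHeadModel.forward`: when the model has been split across GPUs with `parallelize()`, the final hidden states come back on the last device while `lm_head` may live on the first, so they must be moved onto `lm_head`'s device before the projection. A minimal two-GPU sketch of the failure the added lines prevent; the layer placement below is illustrative, not the commit's actual device map:

```python
# Sketch of the cross-device error the model-parallel branch avoids.
# Device and layer placement are illustrative assumptions.
import torch

if torch.cuda.device_count() >= 2:
    lm_head = torch.nn.Linear(768, 50257, bias=False).to("cuda:0")  # head on the first device
    hidden_states = torch.randn(1, 8, 768, device="cuda:1")         # last block's output

    # Calling lm_head(hidden_states) directly would raise a RuntimeError
    # (tensors on different devices); the commit's .to(...) makes it legal.
    hidden_states = hidden_states.to(lm_head.weight.device)
    logits = lm_head(hidden_states)
```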
 
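For context, `self.model_parallel` and `self.transformer.first_device` are set by the `parallelize()` helper that `LOLALMHeadModel` inherits from `GPT2LMHeadModel`, so the new branch only triggers after an explicit parallelize call. A hedged usage sketch, with the repo id assumed as above:

```python
# Sketch: activating the model-parallel path exercised by this commit.
# parallelize() is inherited from GPT2LMHeadModel; the repo id is an assumption.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("dice-research/lola_v1", trust_remote_code=True)
model.parallelize()    # default device map over all visible GPUs; sets model_parallel = True
# ... run training or generation across devices ...
model.deparallelize()  # move everything back to CPU and clear the flag
```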