Update modeling_openelm.py
modeling_openelm.py (+5 -3)
@@ -778,9 +778,11 @@ class OpenELMModel(OpenELMPreTrainedModel):
             padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[
                 :, None, None, :
             ].eq(0.0)
-            causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(
-                padding_mask, min_dtype
-            )
+            causal_mask = causal_mask.clone()
+            causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
+            #causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(
+            #    padding_mask, min_dtype
+            #)
 
         if self.config._attn_implementation == "sdpa" and attention_mask is not None:
             # For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400).
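Context for the change: the causal mask is typically built once and broadcast over the batch with expand(), so the slice assignment writes into overlapping memory and PyTorch rejects the in-place edit; cloning first gives the mask its own storage. The sketch below is a minimal, self-contained reproduction of that failure mode, not the OpenELM source; the shapes, batch size, and attention_mask values are illustrative assumptions.

    import torch

    # Minimal sketch (illustrative, not the model code): why clone() is needed
    # before the in-place masked_fill. An expand()ed mask shares a single copy
    # of its storage across the batch, so writing into a slice of it fails.
    min_dtype = torch.finfo(torch.float32).min
    base = torch.full((4, 4), min_dtype).triu(1)             # 0.0 where attending is allowed
    causal_mask = base[None, None, :, :].expand(2, 1, 4, 4)  # broadcast view, batch size 2

    attention_mask = torch.tensor([[1, 1, 1, 0],             # sample 0: last position padded
                                   [1, 1, 0, 0]])            # sample 1: last two padded
    mask_length = attention_mask.shape[-1]
    # Padded key positions that the causal mask currently leaves open:
    padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)

    try:
        # Same pattern as the removed lines: in-place write into the expanded view.
        causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
    except RuntimeError as err:
        print("write into expanded view fails:", err)

    causal_mask = causal_mask.clone()  # own, contiguous storage, so the edit succeeds
    causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)

This matches the causal_mask = causal_mask.clone() fix that upstream transformers applies in its Llama _update_causal_mask ("copy to contiguous memory for in-place edit"); OpenELM's modeling file copies that code path, so it needs the same clone before the in-place edit.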