Fill-Mask
Transformers
PyTorch
Safetensors
bert
custom_code
robinzixuan committed on
Commit 8b5e38b · verified · 1 Parent(s): 9b45691

Update modeling_bert.py

Files changed (1)
  1. modeling_bert.py +2 -1
modeling_bert.py CHANGED
@@ -384,7 +384,8 @@ class BertSelfAttention(nn.Module):
         attention_scores = attention_scores + attention_mask
 
         # Normalize the attention scores to probabilities.
-        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+        # attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+        attention_probs = softmax_1(attention_scores, dim=-1)
 
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
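
The commit swaps the standard softmax for softmax_1. That function is not defined in this hunk (it is presumably imported or defined elsewhere in modeling_bert.py), but the name suggests the "off-by-one" softmax variant, which adds 1 to the denominator so the attention weights can sum to less than one when every score is strongly negative. A minimal sketch under that assumption; only the helper's name and call signature are taken from the diff:

import torch

def softmax_1(x: torch.Tensor, dim: int = -1) -> torch.Tensor:
    # Off-by-one softmax: softmax_1(x)_i = exp(x_i) / (1 + sum_j exp(x_j)),
    # i.e. a softmax with an extra implicit logit fixed at 0.
    # Shift by max(x, 0) so neither the real logits nor the implicit
    # zero logit can overflow in exp().
    m = x.max(dim=dim, keepdim=True).values.clamp(min=0)
    shifted = torch.exp(x - m)
    return shifted / (torch.exp(-m) + shifted.sum(dim=dim, keepdim=True))

Unlike nn.functional.softmax, the outputs along dim no longer have to sum to exactly 1, so a row whose scores are all pushed toward -inf by the attention mask can attend to essentially nothing instead of being forced to spread probability mass across its positions.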