simplified softmax (to allow torch.compile)
modeling_norbert.py  +4 -25  CHANGED
@@ -101,23 +101,6 @@ class FeedForward(nn.Module):
         return self.mlp(x)
 
 
-class MaskedSoftmax(torch.autograd.Function):
-    @staticmethod
-    def forward(self, x, mask, dim):
-        self.dim = dim
-        x.masked_fill_(mask, float('-inf'))
-        x = torch.softmax(x, self.dim)
-        x.masked_fill_(mask, 0.0)
-        self.save_for_backward(x)
-        return x
-
-    @staticmethod
-    def backward(self, grad_output):
-        output, = self.saved_tensors
-        input_grad = softmax_backward_data(self, grad_output, output, self.dim, output)
-        return input_grad, None, None
-
-
 class Attention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -155,7 +138,7 @@ class Attention(nn.Module):
         bucket_pos = torch.where(abs_pos <= mid, relative_pos, log_pos * sign).long()
         return bucket_pos
 
-    def compute_attention_scores(self, hidden_states, relative_embedding):
+    def forward(self, hidden_states, attention_mask, relative_embedding):
         key_len, batch_size, _ = hidden_states.size()
         query_len = key_len
 
@@ -193,21 +176,17 @@ class Attention(nn.Module):
         attention_scores.add_(attention_c_p)
         attention_scores.add_(attention_p_c)
 
-        return attention_scores, value
+        attention_scores = attention_scores.masked_fill(attention_mask, float('-inf'))
+        attention_probs = F.softmax(attention_scores, dim=-1)
 
-    def compute_output(self, attention_probs, value):
         attention_probs = self.dropout(attention_probs)
         context = torch.bmm(attention_probs.flatten(0, 1), value)  # shape: [B*H, Q, D]
         context = context.transpose(0, 1).reshape(context.size(1), -1, self.hidden_size)  # shape: [Q, B, H*D]
         context = self.out_proj(context)
         context = self.post_layer_norm(context)
         context = self.dropout(context)
-        return context
 
-    def forward(self, hidden_states, attention_mask, relative_embedding):
-        attention_scores, value = self.compute_attention_scores(hidden_states, relative_embedding)
-        attention_probs = MaskedSoftmax.apply(attention_scores, attention_mask, -1)
-        return self.compute_output(attention_probs, value), attention_probs.detach()
+        return context, attention_probs.detach()
 
 
 class Embedding(nn.Module):
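For context, the removed MaskedSoftmax.apply(attention_scores, attention_mask, -1) and the new masked_fill + F.softmax pair produce the same attention probabilities: filling masked positions with -inf before the softmax already drives their probability to exactly zero, so the old post-softmax masked_fill_(mask, 0.0) only mattered for fully masked rows. A minimal standalone sketch of the new path (not part of the commit; the masked_softmax name and tensor shapes are chosen here just for illustration):

import torch
import torch.nn.functional as F

def masked_softmax(scores, mask, dim=-1):
    # Same two ops as the new forward(): mask with -inf, then a plain softmax.
    # exp(-inf) == 0, so masked positions end up with zero probability, which is
    # what the removed autograd.Function enforced with masked_fill_(mask, 0.0).
    scores = scores.masked_fill(mask, float('-inf'))
    return F.softmax(scores, dim=dim)

# Hypothetical shapes: [batch * heads, query_len, key_len].
scores = torch.randn(8, 16, 16)
mask = torch.zeros(8, 16, 16, dtype=torch.bool)
mask[..., -4:] = True  # hide the last four key positions

probs = masked_softmax(scores, mask)
print(probs[..., -4:].abs().max())                # tensor(0.): masked keys get no weight
print(probs.sum(-1).allclose(torch.ones(8, 16)))  # True: each row still sums to 1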
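As for the torch.compile motivation in the commit title: a custom torch.autograd.Function with a hand-written backward (the softmax_backward_data call) is opaque to the compiler and, in the PyTorch 2.x versions this presumably targets, tends to cause graph breaks at MaskedSoftmax.apply, whereas masked_fill and F.softmax are ordinary traceable ops with autograd-provided backwards. A hedged usage sketch, assuming PyTorch >= 2.0 and reusing the masked_softmax helper defined above (also not part of the repository):

import torch

compiled_masked_softmax = torch.compile(masked_softmax)

scores = torch.randn(8, 16, 16, requires_grad=True)
mask = torch.zeros(8, 16, 16, dtype=torch.bool)
mask[..., -4:] = True

probs = compiled_masked_softmax(scores, mask)
(probs ** 2).sum().backward()   # backward comes from autograd through the compiled graph;
                                # no hand-written softmax_backward_data is needed
print(scores.grad[..., -4:].abs().max())  # tensor(0.): masked keys receive no gradient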