ybelkada
/

llama-xformers

Model card Files Files and versions Community

ybelkada committed on Nov 15, 2023

Commit

d281c7a

•

1 Parent(s): 1141922

Update llama_xformers_attention.py

Browse files

Files changed (1) hide show

llama_xformers_attention.py +16 -28

llama_xformers_attention.py CHANGED Viewed

@@ -3,7 +3,7 @@ import torch.nn as nn
 from typing import Optional, Tuple
-from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotary_pos_emb
 from xformers.ops.fmha import (
  memory_efficient_attention,
@@ -51,33 +51,21 @@ class LlamaXFormersAttention(LlamaAttention):
  key_states = repeat_kv(key_states, self.num_key_value_groups)
  value_states = repeat_kv(value_states, self.num_key_value_groups)
- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
- if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
- raise ValueError(
- f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
- f" {attn_weights.size()}"
- )
- if attention_mask is not None:
- if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
- raise ValueError(
- f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
- )
- attn_weights = attn_weights + attention_mask
- # upcast attention to fp32
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
- attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
- attn_output = torch.matmul(attn_weights, value_states)
- if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
- raise ValueError(
- f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
- f" {attn_output.size()}"
- )
- attn_output = attn_output.transpose(1, 2).contiguous()
  attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

 from typing import Optional, Tuple
+from transformers.models.llama.modeling_llama import LlamaAttention, apply_rotary_pos_emb, repeat_kv
 from xformers.ops.fmha import (
  memory_efficient_attention,
  key_states = repeat_kv(key_states, self.num_key_value_groups)
  value_states = repeat_kv(value_states, self.num_key_value_groups)
+ dtype = query_states.dtype
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+ #This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
+ #We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
+ if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
+ # input and output should be of form (bsz, q_len, num_heads, head_dim)
+ attn_output = memory_efficient_attention(query_states, key_states, value_states, attn_bias=None)
+ else:
+ # input and output should be of form (bsz, q_len, num_heads, head_dim)
+ attn_output = memory_efficient_attention(query_states, key_states, value_states, attn_bias=xformers.ops.LowerTriangularMask())
+ attn_weights = None
  attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)