IDEA-FinAI
/

chartmoe

@@ -25,6 +25,7 @@ import torch
 import torch.utils.checkpoint
 from einops import rearrange
 from torch import nn
 from transformers.activations import ACT2FN
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
@@ -42,6 +43,30 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = 'InternLM2Config'
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(input_ids_shape: torch.Size,
@@ -264,21 +289,21 @@ class InternLM2MLP(nn.Module):
             bias=False,
             lora_r=256,
             lora_alpha=256,
-            lora_len=576)
         self.w3 = PLoRA(
             self.hidden_size,
             self.intermediate_size,
             bias=False,
             lora_r=256,
             lora_alpha=256,
-            lora_len=576)
         self.w2 = PLoRA(
             self.intermediate_size,
             self.hidden_size,
             bias=False,
             lora_r=256,
             lora_alpha=256,
-            lora_len=576)
         self.act_fn = ACT2FN[config.hidden_act]
@@ -332,7 +357,7 @@ class InternLM2Attention(nn.Module):
             bias=config.bias,
             lora_r=256,
             lora_alpha=256,
-            lora_len=576)
         self.wo = PLoRA(
             self.num_heads * self.head_dim,
@@ -340,7 +365,7 @@ class InternLM2Attention(nn.Module):
             bias=config.bias,
             lora_r=256,
             lora_alpha=256,
-            lora_len=576)
         self._init_rope()
     def _init_rope(self):
@@ -498,7 +523,7 @@ class InternLM2FlashAttention2(InternLM2Attention):
         qkv_states = rearrange(
             qkv_states,
             'b q (h gs d) -> b q h gs d',
-            gs=self.num_heads + 2 * self.num_key_value_heads,
             d=self.head_dim,
             q=q_len,
         )
@@ -507,6 +532,10 @@ class InternLM2FlashAttention2(InternLM2Attention):
         query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d')
         key_states = qkv_states[..., -2, :]
         value_states = qkv_states[..., -1, :]
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
@@ -523,12 +552,12 @@ class InternLM2FlashAttention2(InternLM2Attention):
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
         past_key_value = (key_states, value_states) if use_cache else None
         query_states = query_states.transpose(1, 2)
         key_states = key_states.transpose(1, 2)
         value_states = value_states.transpose(1, 2)
-        dropout_rate = 0.0 if not self.training else self.attention_dropout
         # In PEFT, usually we cast the layer norms in float32 for training stability reasons
         # therefore the input hidden states gets silently casted in float32. Hence, we need
@@ -569,17 +598,110 @@ class InternLM2FlashAttention2(InternLM2Attention):
             attn_weights = None
         return attn_output, attn_weights, past_key_value
 class InternLM2DecoderLayer(nn.Module):
     def __init__(self, config: InternLM2Config):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.attention = (
-            InternLM2Attention(config=config)
-            if not getattr(config, '_flash_attn_2_enabled', False) else
-            InternLM2FlashAttention2(config=config))
         self.feed_forward = InternLM2MLP(config)
         self.attention_norm = InternLM2RMSNorm(
             config.hidden_size, eps=config.rms_norm_eps)
@@ -773,6 +895,8 @@ class InternLM2Model(InternLM2PreTrainedModel):
     def __init__(self, config: InternLM2Config):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -843,6 +967,9 @@ class InternLM2Model(InternLM2PreTrainedModel):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
@@ -876,14 +1003,18 @@ class InternLM2Model(InternLM2PreTrainedModel):
             inputs_embeds = self.tok_embeddings(input_ids)
             im_mask = torch.zeros(inputs_embeds.shape[:2]).to(
                 inputs_embeds.device).bool()
-        # embed positions
-        if attention_mask is None:
-            attention_mask = torch.ones((batch_size, seq_length_with_past),
-                                        dtype=torch.bool,
-                                        device=inputs_embeds.device)
-        attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, (batch_size, seq_length), inputs_embeds,
-            past_key_values_length)
         # embed positions
         hidden_states = inputs_embeds

 import torch.utils.checkpoint
 from einops import rearrange
 from torch import nn
+import torch.nn.functional as F
 from transformers.activations import ACT2FN
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
 _CONFIG_FOR_DOC = 'InternLM2Config'
+# Flash-attention entry points. They are None at import time and are rebound
+# to the real callables by _import_flash_attn().
+flash_attn_func, flash_attn_varlen_func = None, None
+pad_input, index_first_axis, unpad_input = None, None, None
+def _import_flash_attn():
+    """Bind the flash-attn kernels and padding helpers to the module globals.
+
+    The import happens at call time rather than at module import time, and
+    on success rebinds `flash_attn_func`, `flash_attn_varlen_func`,
+    `pad_input`, `index_first_axis` and `unpad_input`.
+
+    Raises:
+        ImportError: if `flash_attn` or `flash_attn.bert_padding` cannot be
+            imported. The underlying error is attached as `__cause__`.
+    """
+    global flash_attn_func, flash_attn_varlen_func
+    global pad_input, index_first_axis, unpad_input
+    try:
+        from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func
+        from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input
+    except ImportError as err:
+        # Chain the original error explicitly: the package may be present but
+        # fail to load, and the short message alone would hide the reason.
+        raise ImportError("flash_attn is not installed.") from err
+    # Only rebind the globals once every import has succeeded.
+    flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
+    pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    """Derive the unpadding metadata for `attention_mask`.
+
+    Returns a tuple `(indices, cu_seqlens, max_seqlen_in_batch)`:
+      * `indices`: positions of the non-zero entries of the flattened mask;
+      * `cu_seqlens`: int32 cumulative sum of the per-row non-zero counts
+        (summed over the last dimension), with a leading 0;
+      * `max_seqlen_in_batch`: the largest per-row count, as a Python number.
+    """
+    flat_positions = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    row_lengths = attention_mask.sum(dim=-1, dtype=torch.int32)
+    running_total = torch.cumsum(row_lengths, dim=0, dtype=torch.int32)
+    # Prepend a single zero so that entry i is the offset at which row i starts.
+    offsets = F.pad(running_total, (1, 0))
+    longest_row = row_lengths.max().item()
+    return flat_positions, offsets, longest_row
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(input_ids_shape: torch.Size,
             bias=False,
             lora_r=256,
             lora_alpha=256,
+            lora_len=1225)
         self.w3 = PLoRA(
             self.hidden_size,
             self.intermediate_size,
             bias=False,
             lora_r=256,
             lora_alpha=256,
+            lora_len=1225)
         self.w2 = PLoRA(
             self.intermediate_size,
             self.hidden_size,
             bias=False,
             lora_r=256,
             lora_alpha=256,
+            lora_len=1225)
         self.act_fn = ACT2FN[config.hidden_act]
             bias=config.bias,
             lora_r=256,
             lora_alpha=256,
+            lora_len=1225)
         self.wo = PLoRA(
             self.num_heads * self.head_dim,
             bias=config.bias,
             lora_r=256,
             lora_alpha=256,
+            lora_len=1225)
         self._init_rope()
     def _init_rope(self):
         qkv_states = rearrange(
             qkv_states,
             'b q (h gs d) -> b q h gs d',
+            gs=2 + self.num_key_value_groups,
             d=self.head_dim,
             q=q_len,
         )
         query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d')
         key_states = qkv_states[..., -2, :]
         value_states = qkv_states[..., -1, :]
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
         past_key_value = (key_states, value_states) if use_cache else None
         query_states = query_states.transpose(1, 2)
         key_states = key_states.transpose(1, 2)
         value_states = value_states.transpose(1, 2)
+        dropout_rate = 0.0 if not self.training else getattr(self, "dropout_rate", 0.0)
         # In PEFT, usually we cast the layer norms in float32 for training stability reasons
         # therefore the input hidden states gets silently casted in float32. Hence, we need
             attn_weights = None
         return attn_output, attn_weights, past_key_value
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Run Flash Attention. When `attention_mask` is given, the inputs are unpadded first, the variable-length
+        kernel is used, and the result is padded back; otherwise the dense kernel is called directly.
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens. `None` selects the
+                dense (no unpadding) path.
+            query_length (`int`):
+                Sequence length used when padding the output back; causal masking is turned off when it is 1.
+            dropout (`float`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        Returns:
+            The attention output returned by the Flash Attention kernel (re-padded on the masked path).
+        """
+        # A single-token query does not need the causal mask.
+        causal = self.is_causal and query_length != 1
+
+        # Dense path: no padding mask was supplied.
+        if attention_mask is None:
+            return flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+
+        # Masked path: strip the padded positions, run the variable-length kernel, pad back.
+        batch_size = query_states.shape[0]
+        unpadded = self._unpad_input(query_states, key_states, value_states, attention_mask, query_length)
+        q_unpad, k_unpad, v_unpad, indices_q, cu_seq_lens, max_seq_lens = unpadded
+        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+        varlen_output = flash_attn_varlen_func(
+            q_unpad,
+            k_unpad,
+            v_unpad,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            max_seqlen_q=max_seqlen_in_batch_q,
+            max_seqlen_k=max_seqlen_in_batch_k,
+            dropout_p=dropout,
+            softmax_scale=softmax_scale,
+            causal=causal,
+        )
+        return pad_input(varlen_output, indices_q, batch_size, query_length)
+    def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        """
+        Remove the padded positions from the query/key/value tensors for the variable-length Flash Attention
+        kernel.
+
+        `key_layer` is unpacked as `(batch_size, kv_seq_len, num_key_value_heads, head_dim)`; keys and values are
+        flattened over the first two dimensions and gathered with the indices derived from `attention_mask`.
+        The query is handled in one of three ways:
+          * `query_length == kv_seq_len`: gathered with the same indices as the keys;
+          * `query_length == 1`: every row contributes exactly one query, so no gathering is needed;
+          * otherwise: the last `query_length` columns of the mask are used (this assumes left padding).
+
+        Returns:
+            `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`, with `indices_q` cast to int64.
+        """
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            # Newer flash_attn releases return an extra trailing value from unpad_input;
+            # only the first four are used here, so slice instead of unpacking the whole tuple.
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
+                query_layer, attention_mask
+            )[:4]
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q.to(torch.int64),
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+# Maps the `config.attn_implementation` string to the attention module class
+# that InternLM2DecoderLayer instantiates.
+INTERNLM2_ATTENTION_CLASSES = {
+    "eager": InternLM2Attention,
+    "flash_attention_2": InternLM2FlashAttention2,
+}
 class InternLM2DecoderLayer(nn.Module):
     def __init__(self, config: InternLM2Config):
         super().__init__()
         self.hidden_size = config.hidden_size
+        self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config)
         self.feed_forward = InternLM2MLP(config)
         self.attention_norm = InternLM2RMSNorm(
             config.hidden_size, eps=config.rms_norm_eps)
     def __init__(self, config: InternLM2Config):
         super().__init__(config)
+        print(f"Attention Implementation: {self.config.attn_implementation}")
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if self.config.attn_implementation == "flash_attention_2":
+            _import_flash_attn()
         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
             inputs_embeds = self.tok_embeddings(input_ids)
             im_mask = torch.zeros(inputs_embeds.shape[:2]).to(
                 inputs_embeds.device).bool()
+        if self.config.attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        else:
+            if attention_mask is None:
+                attention_mask = torch.ones((batch_size, seq_length_with_past),
+                                            dtype=torch.bool,
+                                            device=inputs_embeds.device)
+            attention_mask = self._prepare_decoder_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds,
+                past_key_values_length)
         # embed positions
         hidden_states = inputs_embeds