microsoft
/

Phi-3-vision-128k-instruct

@@ -13,13 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import math
 import torch
-import torch.nn as nn
-from transformers import CLIPVisionModel, PretrainedConfig
-from transformers import CLIPVisionConfig
 from transformers.utils import logging
-from datetime import datetime
 logger = logging.get_logger(__name__)
@@ -37,9 +42,42 @@ CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
   num_channels=3,
   num_hidden_layers=24,
   patch_size=14,
-  projection_dim=768
 )
 class Phi3ImageEmbedding(nn.Module):
     """Phi3 Image embedding."""
@@ -65,6 +103,13 @@ class Phi3ImageEmbedding(nn.Module):
             self.img_processor = CLIPVisionModel(clip_config)
             image_dim_out = config.img_processor['image_dim_out']
             self.num_img_tokens = config.img_processor['num_img_tokens']
         else:
             raise NotImplementedError(f'img_processor = {config.img_processor}, not implemented')
@@ -157,15 +202,15 @@ class Phi3ImageEmbedding(nn.Module):
         with torch.no_grad():
             positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=False)
         select = False
-        if isinstance(self.img_projection, nn.Sequential):
-            target_device = self.img_projection[0].bias.device
-            target_dtype = self.img_projection[0].bias.dtype
-        else:  # It's a single nn.Linear layer
-            target_device = self.img_projection.bias.device
-            target_dtype = self.img_projection.bias.dtype
         if len(positions.tolist()) > 0:
             with torch.no_grad():
@@ -197,7 +242,7 @@ class Phi3ImageEmbedding(nn.Module):
                     img_sizes = img_sizes.view(-1, 2)
                 for _bs in range(bs):
                     h, w = img_sizes[_bs]
-                    h = h // 336
                     w = w // 336
                     B_ = h * w
@@ -235,7 +280,7 @@ class Phi3ImageEmbedding(nn.Module):
                     temp_len = int((h*w+1)*144 + 1 + (h+1)*12)
                     assert temp_len == output_imgs[-1].shape[1], f'temp_len: {temp_len}, output_imgs[-1].shape[1]: {output_imgs[-1].shape[1]}'
                     output_len.append(temp_len)
                 num_img_tokens = output_len
                 img_set_tensor = []
                 for _output_img in output_imgs:
@@ -267,10 +312,10 @@ class Phi3ImageEmbedding(nn.Module):
             else:
                 raise NotImplementedError
             select = True
         with torch.no_grad():
             input_ids.clamp_min_(0).clamp_max_(self.vocab_size)
         hidden_states = self.wte(input_ids)
         if select:

 # See the License for the specific language governing permissions and
 # limitations under the License.
+from datetime import datetime
 import torch
+from torch import nn
+from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig
+from transformers.models.clip.modeling_clip import CLIPAttention
 from transformers.utils import logging
+try:
+    from flash_attn import flash_attn_func
+except ImportError:
+    pass
 logger = logging.get_logger(__name__)
   num_channels=3,
   num_hidden_layers=24,
   patch_size=14,
+  projection_dim=768
 )
class CLIPAttentionFA2(CLIPAttention):
    """CLIPAttention variant that computes attention with flash attention 2.

    This is only used in the vision encoder. Attention masks and returning
    attention weights are not supported.
    """

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        causal_attention_mask=None,
        output_attentions=False,
    ):
        """Run self-attention over ``hidden_states`` through ``flash_attn_func``.

        Input shape: Batch x Time x Channel

        Args:
            hidden_states: Input tensor; its three dimensions are unpacked as
                ``(bsz, tgt_len, embed_dim)``.
            attention_mask: Must be ``None``; masking is not supported.
            causal_attention_mask: Must be ``None``; masking is not supported.
            output_attentions: Must be falsy (``False`` or ``None``); attention
                weights are never returned by this implementation.

        Returns:
            A tuple ``(attn_output, None)``. ``attn_output`` has shape
            ``(bsz, tgt_len, embed_dim)`` before ``out_proj`` is applied; the
            second element stands in for the attention weights.

        Raises:
            ValueError: If a mask is supplied or attention weights are requested.
        """
        # Explicit raises instead of `assert`: assertions are stripped under
        # `python -O`, which would let an unsupported mask be silently ignored.
        if attention_mask is not None:
            raise ValueError("CLIPAttentionFA2 does not support attention_mask")
        if causal_attention_mask is not None:
            raise ValueError("CLIPAttentionFA2 does not support causal_attention_mask")
        # Truthiness check so that `None` (not requested) is accepted like `False`.
        if output_attentions:
            raise ValueError("CLIPAttentionFA2 does not support output_attentions")

        bsz, tgt_len, embed_dim = hidden_states.size()
        # Split the projected channels into heads: (bsz, tgt_len, num_heads, head_dim).
        # No transpose is done; the tensors go to flash_attn_func in this layout.
        head_shape = (bsz, tgt_len, self.num_heads, self.head_dim)
        query_states = self.q_proj(hidden_states).reshape(head_shape)
        key_states = self.k_proj(hidden_states).reshape(head_shape)
        value_states = self.v_proj(hidden_states).reshape(head_shape)

        # NOTE(review): `flash_attn_func` comes from the optional module-level
        # `flash_attn` import; if that import failed, this call raises NameError.
        attn_output = flash_attn_func(
            query_states,
            key_states,
            value_states,
            # Attention dropout is only applied in training mode.
            dropout_p=self.dropout if self.training else 0.0,
            softmax_scale=self.scale,
            causal=False,
        ).reshape(bsz, tgt_len, embed_dim)  # merge heads back into the channel dim

        attn_output = self.out_proj(attn_output)
        return attn_output, None
 class Phi3ImageEmbedding(nn.Module):
     """Phi3 Image embedding."""
             self.img_processor = CLIPVisionModel(clip_config)
             image_dim_out = config.img_processor['image_dim_out']
             self.num_img_tokens = config.img_processor['num_img_tokens']
+            # FA2 in CLIP
+            if config._attn_implementation == 'flash_attention_2':
+                for layer in self.img_processor.vision_model.encoder.layers:
+                    clip_fa2 = CLIPAttentionFA2(clip_config)
+                    del layer.self_attn
+                    layer.self_attn = clip_fa2
         else:
             raise NotImplementedError(f'img_processor = {config.img_processor}, not implemented')
         with torch.no_grad():
             positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=False)
         select = False
+        if isinstance(self.img_projection, nn.Sequential):
+            target_device = self.img_projection[0].bias.device
+            target_dtype = self.img_projection[0].bias.dtype
+        else:  # It's a single nn.Linear layer
+            target_device = self.img_projection.bias.device
+            target_dtype = self.img_projection.bias.dtype
         if len(positions.tolist()) > 0:
             with torch.no_grad():
                     img_sizes = img_sizes.view(-1, 2)
                 for _bs in range(bs):
                     h, w = img_sizes[_bs]
+                    h = h // 336
                     w = w // 336
                     B_ = h * w
                     temp_len = int((h*w+1)*144 + 1 + (h+1)*12)
                     assert temp_len == output_imgs[-1].shape[1], f'temp_len: {temp_len}, output_imgs[-1].shape[1]: {output_imgs[-1].shape[1]}'
                     output_len.append(temp_len)
                 num_img_tokens = output_len
                 img_set_tensor = []
                 for _output_img in output_imgs:
             else:
                 raise NotImplementedError
             select = True
         with torch.no_grad():
             input_ids.clamp_min_(0).clamp_max_(self.vocab_size)
         hidden_states = self.wte(input_ids)
         if select: