radna
/

mini_intern_chat_triton

Visual Question Answering

Transformers

Safetensors

internvl_chat

feature-extraction

custom_code

Model card Files Files and versions Community

HGB committed on Jun 9, 2024

Commit

e065dd1

1 Parent(s): 9c8bb9e

remove formatting

Browse files

Files changed (1) hide show

modeling_intern_vit.py +207 -122

modeling_intern_vit.py CHANGED Viewed

@@ -12,13 +12,13 @@ from einops import rearrange
 from timm.models.layers import DropPath
 from torch import nn
 from transformers.activations import ACT2FN
-from transformers.modeling_outputs import (BaseModelOutput,
-                                           BaseModelOutputWithPooling)
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
 from .configuration_intern_vit import InternVisionConfig
 try:
     from triton_flash_atn import _attention
@@ -26,7 +26,7 @@ try:
     has_flash_attn = True
 except:
-    print('FlashAttention is not installed.')
     has_flash_attn = False
 logger = logging.get_logger(__name__)
@@ -43,13 +43,22 @@ class FlashAttention(nn.Module):
                            (default: 0.0)
     """
-    def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
         super().__init__()
         self.softmax_scale = softmax_scale
         self.dropout_p = attention_dropout
-    def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
-                max_s=None, need_weights=False):
         """Implements the multihead softmax attention.
         Arguments
         ---------
@@ -65,35 +74,58 @@ class FlashAttention(nn.Module):
             batch_size = qkv.shape[0]
             seqlen = qkv.shape[1]
             if key_padding_mask is None:
-                qkv = rearrange(qkv, 'b s ... -> (b s) ...')
                 max_s = seqlen
-                cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
-                                          device=qkv.device)
                 output = _attention.apply(
-                    qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
-                    sm_scale=self.softmax_scale, causal=causal
                 )
-                output = rearrange(
-                    output, '(b s) ... -> b s ...', b=batch_size)
             else:
                 nheads = qkv.shape[-2]
-                x = rearrange(qkv, 'b s three h d -> b s (three h d)')
-                x_unpad, indices, cu_seqlens, max_s = unpad_input(
-                    x, key_padding_mask)
                 x_unpad = rearrange(
-                    x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
                 output_unpad = _attention.apply(
-                    x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
-                    sm_scale=self.softmax_scale, causal=causal
                 )
-                output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
-                                             indices, batch_size, seqlen),
-                                   'b s (h d) -> b s h d', h=nheads)
         else:
             assert max_s is not None
             output = _attention.apply(
-                qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
-                sm_scale=self.softmax_scale, causal=causal
             )
         return output, None
@@ -109,8 +141,7 @@ class InternRMSNorm(nn.Module):
         input_dtype = hidden_states.dtype
         hidden_states = hidden_states.to(torch.float32)
         variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * \
-            torch.rsqrt(variance + self.variance_epsilon)
         return self.weight * hidden_states.to(input_dtype)
@@ -120,19 +151,21 @@ try:
     InternRMSNorm = FusedRMSNorm  # noqa
     logger.info(
-        'Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
 except ImportError:
     # using the normal InternRMSNorm
     pass
 except Exception:
     logger.warning(
-        'discovered apex but it failed to load, falling back to InternRMSNorm')
     pass
 NORM2FN = {
-    'rms_norm': InternRMSNorm,
-    'layer_norm': nn.LayerNorm,
 }
@@ -149,21 +182,37 @@ class InternVisionEmbeddings(nn.Module):
         )
         self.patch_embedding = nn.Conv2d(
-            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
         )
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
         self.position_embedding = nn.Parameter(
-            torch.randn(1, self.num_positions, self.embed_dim))
     def _get_pos_embed(self, pos_embed, H, W):
         target_dtype = pos_embed.dtype
-        pos_embed = pos_embed.float().reshape(
-            1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
-        pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \
-            reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
         return pos_embed
     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
@@ -172,14 +221,15 @@ class InternVisionEmbeddings(nn.Module):
         patch_embeds = self.patch_embedding(pixel_values)
         batch_size, _, height, width = patch_embeds.shape
         patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
-        class_embeds = self.class_embedding.expand(
-            batch_size, 1, -1).to(target_dtype)
         embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
-        position_embedding = torch.cat([
-            self.position_embedding[:, :1, :],
-            self._get_pos_embed(
-                self.position_embedding[:, 1:, :], height, width)
-        ], dim=1)
         embeddings = embeddings + position_embedding.to(target_dtype)
         return embeddings
@@ -195,49 +245,54 @@ class InternAttention(nn.Module):
         self.use_flash_attn = config.use_flash_attn and has_flash_attn
         if config.use_flash_attn and not has_flash_attn:
             print(
-                'Warning: Flash Attention is not available, use_flash_attn is set to False.')
         self.head_dim = self.embed_dim // self.num_heads
         if self.head_dim * self.num_heads != self.embed_dim:
             raise ValueError(
-                f'embed_dim must be divisible by num_heads (got `embed_dim`: {
-                    self.embed_dim} and `num_heads`:'
-                f' {self.num_heads}).'
             )
-        self.scale = self.head_dim ** -0.5
-        self.qkv = nn.Linear(self.embed_dim, 3 *
-                             self.embed_dim, bias=config.qkv_bias)
         self.attn_drop = nn.Dropout(config.attention_dropout)
         self.proj_drop = nn.Dropout(config.dropout)
         self.qk_normalization = config.qk_normalization
         if self.qk_normalization:
-            self.q_norm = InternRMSNorm(
-                self.embed_dim, eps=config.layer_norm_eps)
-            self.k_norm = InternRMSNorm(
-                self.embed_dim, eps=config.layer_norm_eps)
         if self.use_flash_attn:
-            self.inner_attn = FlashAttention(
-                attention_dropout=config.attention_dropout)
         self.proj = nn.Linear(self.embed_dim, self.embed_dim)
     def _naive_attn(self, x):
         B, N, C = x.shape
-        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C //
-                                  self.num_heads).permute(2, 0, 3, 1, 4)
         # make torchscript happy (cannot use tensor as tuple)
         q, k, v = qkv.unbind(0)
         if self.qk_normalization:
             B_, H_, N_, D_ = q.shape
-            q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)
-                            ).view(B_, N_, H_, D_).transpose(1, 2)
-            k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)
-                            ).view(B_, N_, H_, D_).transpose(1, 2)
-        attn = ((q * self.scale) @ k.transpose(-2, -1))
         attn = attn.softmax(dim=-1)
         attn = self.attn_drop(attn)
@@ -248,8 +303,9 @@ class InternAttention(nn.Module):
     def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
         qkv = self.qkv(x)
-        qkv = rearrange(qkv, 'b s (three h d) -> b s three h d',
-                        three=3, h=self.num_heads)
         if self.qk_normalization:
             q, k, v = qkv.unbind(2)
@@ -258,15 +314,21 @@ class InternAttention(nn.Module):
             qkv = torch.stack([q, k, v], dim=2)
         context, _ = self.inner_attn(
-            qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
         )
-        outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
         outs = self.proj_drop(outs)
         return outs
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        x = self._naive_attn(
-            hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
         return x
@@ -294,33 +356,37 @@ class InternVisionEncoderLayer(nn.Module):
         self.attn = InternAttention(config)
         self.mlp = InternMLP(config)
-        self.norm1 = NORM2FN[self.norm_type](
-            self.embed_dim, eps=config.layer_norm_eps)
-        self.norm2 = NORM2FN[self.norm_type](
-            self.embed_dim, eps=config.layer_norm_eps)
-        self.ls1 = nn.Parameter(
-            config.initializer_factor * torch.ones(self.embed_dim))
-        self.ls2 = nn.Parameter(
-            config.initializer_factor * torch.ones(self.embed_dim))
-        self.drop_path1 = DropPath(
-            drop_path_rate) if drop_path_rate > 0. else nn.Identity()
-        self.drop_path2 = DropPath(
-            drop_path_rate) if drop_path_rate > 0. else nn.Identity()
     def forward(
-            self,
-            hidden_states: torch.Tensor,
-    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
         """
         Args:
             hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
         """
-        hidden_states = hidden_states + \
-            self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
-        hidden_states = hidden_states + \
-            self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
         return hidden_states
@@ -339,17 +405,23 @@ class InternVisionEncoder(nn.Module):
         super().__init__()
         self.config = config
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(
-            0, config.drop_path_rate, config.num_hidden_layers)]
-        self.layers = nn.ModuleList([
-            InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
         self.gradient_checkpointing = True
     def forward(
-            self,
-            inputs_embeds,
-            output_hidden_states: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutput]:
         r"""
         Args:
@@ -362,9 +434,13 @@ class InternVisionEncoder(nn.Module):
                 Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
         """
         output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         encoder_states = () if output_hidden_states else None
         hidden_states = inputs_embeds
@@ -374,8 +450,8 @@ class InternVisionEncoder(nn.Module):
                 encoder_states = encoder_states + (hidden_states,)
             if self.gradient_checkpointing and self.training:
                 layer_outputs = torch.utils.checkpoint.checkpoint(
-                    encoder_layer,
-                    hidden_states)
             else:
                 layer_outputs = encoder_layer(
                     hidden_states,
@@ -393,9 +469,9 @@ class InternVisionEncoder(nn.Module):
 class InternVisionModel(PreTrainedModel):
-    main_input_name = 'pixel_values'
     config_class = InternVisionConfig
-    _no_split_modules = ['InternVisionEncoderLayer']
     def __init__(self, config: InternVisionConfig):
         super().__init__(config)
@@ -408,36 +484,46 @@ class InternVisionModel(PreTrainedModel):
         pos_emb = self.embeddings.position_embedding
         _, num_positions, embed_dim = pos_emb.shape
         cls_emb = pos_emb[:, :1, :]
-        pos_emb = pos_emb[:, 1:, :].reshape(
-            1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
-        pos_emb = F.interpolate(pos_emb.float(
-        ), size=new_size // patch_size, mode='bicubic', align_corners=False)
-        pos_emb = pos_emb.to(cls_emb.dtype).reshape(
-            1, embed_dim, -1).permute(0, 2, 1)
         pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
         self.embeddings.position_embedding = nn.Parameter(pos_emb)
         self.embeddings.image_size = new_size
-        logger.info('Resized position embeddings from {} to {}'.format(
-            old_size, new_size))
     def get_input_embeddings(self):
         return self.embeddings
     def forward(
-            self,
-            pixel_values: Optional[torch.FloatTensor] = None,
-            output_hidden_states: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
-            pixel_embeds: Optional[torch.FloatTensor] = None,
     ) -> Union[Tuple, BaseModelOutputWithPooling]:
         output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if pixel_values is None and pixel_embeds is None:
-            raise ValueError(
-                'You have to specify pixel_values or pixel_embeds')
         if pixel_embeds is not None:
             hidden_states = pixel_embeds
@@ -445,8 +531,7 @@ class InternVisionModel(PreTrainedModel):
             if len(pixel_values.shape) == 4:
                 hidden_states = self.embeddings(pixel_values)
             else:
-                raise ValueError(f'wrong pixel_values size: {
-                                 pixel_values.shape}')
         encoder_outputs = self.encoder(
             inputs_embeds=hidden_states,
             output_hidden_states=output_hidden_states,

 from timm.models.layers import DropPath
 from torch import nn
 from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
 from .configuration_intern_vit import InternVisionConfig
 try:
     from triton_flash_atn import _attention
     has_flash_attn = True
 except:
+    print("FlashAttention is not installed.")
     has_flash_attn = False
 logger = logging.get_logger(__name__)
                            (default: 0.0)
     """
+    def __init__(
+        self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None
+    ):
         super().__init__()
         self.softmax_scale = softmax_scale
         self.dropout_p = attention_dropout
+    def forward(
+        self,
+        qkv,
+        key_padding_mask=None,
+        causal=False,
+        cu_seqlens=None,
+        max_s=None,
+        need_weights=False,
+    ):
         """Implements the multihead softmax attention.
         Arguments
         ---------
             batch_size = qkv.shape[0]
             seqlen = qkv.shape[1]
             if key_padding_mask is None:
+                qkv = rearrange(qkv, "b s ... -> (b s) ...")
                 max_s = seqlen
+                cu_seqlens = torch.arange(
+                    0,
+                    (batch_size + 1) * seqlen,
+                    step=seqlen,
+                    dtype=torch.int32,
+                    device=qkv.device,
+                )
                 output = _attention.apply(
+                    qkv,
+                    cu_seqlens,
+                    max_s,
+                    self.dropout_p if self.training else 0.0,
+                    sm_scale=self.softmax_scale,
+                    causal=causal,
                 )
+                output = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
             else:
                 nheads = qkv.shape[-2]
+                x = rearrange(qkv, "b s three h d -> b s (three h d)")
+                x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
                 x_unpad = rearrange(
+                    x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
+                )
                 output_unpad = _attention.apply(
+                    x_unpad,
+                    cu_seqlens,
+                    max_s,
+                    self.dropout_p if self.training else 0.0,
+                    sm_scale=self.softmax_scale,
+                    causal=causal,
+                )
+                output = rearrange(
+                    pad_input(
+                        rearrange(output_unpad, "nnz h d -> nnz (h d)"),
+                        indices,
+                        batch_size,
+                        seqlen,
+                    ),
+                    "b s (h d) -> b s h d",
+                    h=nheads,
                 )
         else:
             assert max_s is not None
             output = _attention.apply(
+                qkv,
+                cu_seqlens,
+                max_s,
+                self.dropout_p if self.training else 0.0,
+                sm_scale=self.softmax_scale,
+                causal=causal,
             )
         return output, None
         input_dtype = hidden_states.dtype
         hidden_states = hidden_states.to(torch.float32)
         variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
         return self.weight * hidden_states.to(input_dtype)
     InternRMSNorm = FusedRMSNorm  # noqa
     logger.info(
+        "Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm"
+    )
 except ImportError:
     # using the normal InternRMSNorm
     pass
 except Exception:
     logger.warning(
+        "discovered apex but it failed to load, falling back to InternRMSNorm"
+    )
     pass
 NORM2FN = {
+    "rms_norm": InternRMSNorm,
+    "layer_norm": nn.LayerNorm,
 }
         )
         self.patch_embedding = nn.Conv2d(
+            in_channels=3,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
         )
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches + 1
         self.position_embedding = nn.Parameter(
+            torch.randn(1, self.num_positions, self.embed_dim)
+        )
     def _get_pos_embed(self, pos_embed, H, W):
         target_dtype = pos_embed.dtype
+        pos_embed = (
+            pos_embed.float()
+            .reshape(
+                1,
+                self.image_size // self.patch_size,
+                self.image_size // self.patch_size,
+                -1,
+            )
+            .permute(0, 3, 1, 2)
+        )
+        pos_embed = (
+            F.interpolate(pos_embed, size=(H, W), mode="bicubic", align_corners=False)
+            .reshape(1, -1, H * W)
+            .permute(0, 2, 1)
+            .to(target_dtype)
+        )
         return pos_embed
     def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
         patch_embeds = self.patch_embedding(pixel_values)
         batch_size, _, height, width = patch_embeds.shape
         patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
         embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        position_embedding = torch.cat(
+            [
+                self.position_embedding[:, :1, :],
+                self._get_pos_embed(self.position_embedding[:, 1:, :], height, width),
+            ],
+            dim=1,
+        )
         embeddings = embeddings + position_embedding.to(target_dtype)
         return embeddings
         self.use_flash_attn = config.use_flash_attn and has_flash_attn
         if config.use_flash_attn and not has_flash_attn:
             print(
+                "Warning: Flash Attention is not available, use_flash_attn is set to False."
+            )
         self.head_dim = self.embed_dim // self.num_heads
         if self.head_dim * self.num_heads != self.embed_dim:
             raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
             )
+        self.scale = self.head_dim**-0.5
+        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
         self.attn_drop = nn.Dropout(config.attention_dropout)
         self.proj_drop = nn.Dropout(config.dropout)
         self.qk_normalization = config.qk_normalization
         if self.qk_normalization:
+            self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
+            self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
         if self.use_flash_attn:
+            self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
         self.proj = nn.Linear(self.embed_dim, self.embed_dim)
     def _naive_attn(self, x):
         B, N, C = x.shape
+        qkv = (
+            self.qkv(x)
+            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
+            .permute(2, 0, 3, 1, 4)
+        )
         # make torchscript happy (cannot use tensor as tuple)
         q, k, v = qkv.unbind(0)
         if self.qk_normalization:
             B_, H_, N_, D_ = q.shape
+            q = (
+                self.q_norm(q.transpose(1, 2).flatten(-2, -1))
+                .view(B_, N_, H_, D_)
+                .transpose(1, 2)
+            )
+            k = (
+                self.k_norm(k.transpose(1, 2).flatten(-2, -1))
+                .view(B_, N_, H_, D_)
+                .transpose(1, 2)
+            )
+        attn = (q * self.scale) @ k.transpose(-2, -1)
         attn = attn.softmax(dim=-1)
         attn = self.attn_drop(attn)
     def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
         qkv = self.qkv(x)
+        qkv = rearrange(
+            qkv, "b s (three h d) -> b s three h d", three=3, h=self.num_heads
+        )
         if self.qk_normalization:
             q, k, v = qkv.unbind(2)
             qkv = torch.stack([q, k, v], dim=2)
         context, _ = self.inner_attn(
+            qkv,
+            key_padding_mask=key_padding_mask,
+            need_weights=need_weights,
+            causal=False,
         )
+        outs = self.proj(rearrange(context, "b s h d -> b s (h d)"))
         outs = self.proj_drop(outs)
         return outs
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        x = (
+            self._naive_attn(hidden_states)
+            if not self.use_flash_attn
+            else self._flash_attn(hidden_states)
+        )
         return x
         self.attn = InternAttention(config)
         self.mlp = InternMLP(config)
+        self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
+        self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
+        self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
+        self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
+        self.drop_path1 = (
+            DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        )
+        self.drop_path2 = (
+            DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        )
     def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> Tuple[
+        torch.FloatTensor,
+        Optional[torch.FloatTensor],
+        Optional[Tuple[torch.FloatTensor]],
+    ]:
         """
         Args:
             hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
         """
+        hidden_states = hidden_states + self.drop_path1(
+            self.attn(self.norm1(hidden_states)) * self.ls1
+        )
+        hidden_states = hidden_states + self.drop_path2(
+            self.mlp(self.norm2(hidden_states)) * self.ls2
+        )
         return hidden_states
         super().__init__()
         self.config = config
         # stochastic depth decay rule
+        dpr = [
+            x.item()
+            for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)
+        ]
+        self.layers = nn.ModuleList(
+            [
+                InternVisionEncoderLayer(config, dpr[idx])
+                for idx in range(config.num_hidden_layers)
+            ]
+        )
         self.gradient_checkpointing = True
     def forward(
+        self,
+        inputs_embeds,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutput]:
         r"""
         Args:
                 Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
         """
         output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
         )
         encoder_states = () if output_hidden_states else None
         hidden_states = inputs_embeds
                 encoder_states = encoder_states + (hidden_states,)
             if self.gradient_checkpointing and self.training:
                 layer_outputs = torch.utils.checkpoint.checkpoint(
+                    encoder_layer, hidden_states
+                )
             else:
                 layer_outputs = encoder_layer(
                     hidden_states,
 class InternVisionModel(PreTrainedModel):
+    main_input_name = "pixel_values"
     config_class = InternVisionConfig
+    _no_split_modules = ["InternVisionEncoderLayer"]
     def __init__(self, config: InternVisionConfig):
         super().__init__(config)
         pos_emb = self.embeddings.position_embedding
         _, num_positions, embed_dim = pos_emb.shape
         cls_emb = pos_emb[:, :1, :]
+        pos_emb = (
+            pos_emb[:, 1:, :]
+            .reshape(1, old_size // patch_size, old_size // patch_size, -1)
+            .permute(0, 3, 1, 2)
+        )
+        pos_emb = F.interpolate(
+            pos_emb.float(),
+            size=new_size // patch_size,
+            mode="bicubic",
+            align_corners=False,
+        )
+        pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
         pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
         self.embeddings.position_embedding = nn.Parameter(pos_emb)
         self.embeddings.image_size = new_size
+        logger.info(
+            "Resized position embeddings from {} to {}".format(old_size, new_size)
+        )
     def get_input_embeddings(self):
         return self.embeddings
     def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        pixel_embeds: Optional[torch.FloatTensor] = None,
     ) -> Union[Tuple, BaseModelOutputWithPooling]:
         output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
         )
         if pixel_values is None and pixel_embeds is None:
+            raise ValueError("You have to specify pixel_values or pixel_embeds")
         if pixel_embeds is not None:
             hidden_states = pixel_embeds
             if len(pixel_values.shape) == 4:
                 hidden_states = self.embeddings(pixel_values)
             else:
+                raise ValueError(f"wrong pixel_values size: {pixel_values.shape}")
         encoder_outputs = self.encoder(
             inputs_embeds=hidden_states,
             output_hidden_states=output_hidden_states,