Files changed (9)
  1. .gitignore +0 -70
  2. README.md +4 -21
  3. configuration_clip.py +6 -22
  4. eva_model.py +27 -30
  5. hf_model.py +102 -169
  6. modeling_clip.py +160 -241
  7. processing_clip.py +1 -0
  8. rope_embeddings.py +9 -4
  9. transform.py +179 -95
.gitignore DELETED
@@ -1,70 +0,0 @@
1
- # Project specific
2
- __init__.py
3
- pyproject.toml
4
-
5
- # Byte-compiled / optimized / DLL files
6
- __pycache__/
7
- *.py[cod]
8
- *$py.class
9
-
10
- # C extensions
11
- *.so
12
-
13
- # Distribution / packaging
14
- .Python
15
- build/
16
- develop-eggs/
17
- dist/
18
- downloads/
19
- eggs/
20
- .eggs/
21
- lib/
22
- lib64/
23
- parts/
24
- sdist/
25
- var/
26
- wheels/
27
- pip-wheel-metadata/
28
- share/python-wheels/
29
- *.egg-info/
30
- .installed.cfg
31
- *.egg
32
- MANIFEST
33
-
34
- # Unit test / coverage reports
35
- htmlcov/
36
- .tox/
37
- .nox/
38
- .coverage
39
- .coverage.*
40
- .cache
41
- nosetests.xml
42
- coverage.xml
43
- *.cover
44
- *.py,cover
45
- .hypothesis/
46
- .pytest_cache/
47
-
48
- # Jupyter Notebook
49
- .ipynb_checkpoints
50
-
51
- # IPython
52
- profile_default/
53
- ipython_config.py
54
-
55
- # Environments
56
- .env
57
- .venv
58
- env/
59
- venv/
60
- ENV/
61
- env.bak/
62
- venv.bak/
63
-
64
- # mypy
65
- .mypy_cache/
66
- .dmypy.json
67
- dmypy.json
68
-
69
- # PyCharm
70
- .idea/
README.md CHANGED
@@ -1,27 +1,10 @@
1
- ---
2
- tags:
3
- - transformers
4
- - xlm-roberta
5
- - eva02
6
- - clip
7
- library_name: transformers
8
- license: cc-by-nc-4.0
9
- ---
10
-
11
  # Jina CLIP
12
 
13
- Core implementation of Jina CLIP. The model uses:
14
- * the [EVA 02](https://github.com/baaivision/EVA/tree/master/EVA-CLIP/rei/eva_clip) architecture for the vision tower
15
- * the [Jina XLM RoBERTa with Flash Attention](https://huggingface.co/jinaai/xlm-roberta-flash-implementation) model as a text tower
16
-
17
- ## Models that use this implementation
18
-
19
- - [jinaai/jina-clip-v2](https://huggingface.co/jinaai/jina-clip-v2)
20
- - [jinaai/jina-clip-v1](https://huggingface.co/jinaai/jina-clip-v1)
21
-
22
- ## Requirements
23
 
24
- To use the Jina CLIP source code, the following packages are required:
25
  * `torch`
26
  * `timm`
27
  * `transformers`
1
  # Jina CLIP
2
 
3
+ The Jina CLIP implementation is hosted in this repository. The model uses:
4
+ * the EVA 02 architecture for the vision tower
5
+ * the Jina BERT with Flash Attention model as a text tower
6
 
7
+ To use the Jina CLIP model, the following packages are required:
8
  * `torch`
9
  * `timm`
10
  * `transformers`
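
For reference, a minimal usage sketch of the resulting model (the checkpoint id is an assumption taken from the Jina CLIP releases that build on this code; `trust_remote_code=True` is needed because the implementation lives in the model repository):

```python
from transformers import AutoModel

# Sketch only: checkpoint id assumed from the released Jina CLIP models.
model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)

# encode_text / encode_image are defined in modeling_clip.py below
text_emb = model.encode_text(['a photo of a cat'])
image_emb = model.encode_image(['cat.jpg'])
```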
configuration_clip.py CHANGED
@@ -8,7 +8,6 @@ import os
8
  from copy import deepcopy
9
  from typing import Any, Dict, List, Optional, Union
10
 
11
- import torch
12
  from transformers import PretrainedConfig, logging
13
 
14
  logger = logging.get_logger(__name__)
@@ -25,8 +24,6 @@ class JinaCLIPTextConfig(PretrainedConfig):
25
  embed_dim: int = 768,
26
  hf_model_name_or_path: str = 'jinaai/jina-bert-flash-implementation',
27
  hf_model_config_kwargs: Optional[Dict[str, Any]] = None,
28
- default_instruction_task: Optional[str] = None,
29
- default_lora_task: Optional[str] = None,
30
  pooler_type: Optional[str] = None,
31
  proj_type: Optional[str] = None,
32
  proj_bias: bool = False,
@@ -37,8 +34,6 @@ class JinaCLIPTextConfig(PretrainedConfig):
37
  self.embed_dim = embed_dim
38
  self.hf_model_name_or_path = hf_model_name_or_path
39
  self.hf_model_config_kwargs = hf_model_config_kwargs or {}
40
- self.default_instruction_task = default_instruction_task
41
- self.default_lora_task = default_lora_task
42
  self.pooler_type = pooler_type
43
  self.proj_type = proj_type
44
  self.proj_bias = proj_bias
@@ -52,9 +47,11 @@ class JinaCLIPTextConfig(PretrainedConfig):
52
  configdict, kwargs = cls.get_config_dict(
53
  pretrained_model_name_or_path, **kwargs
54
  )
 
55
  # get the text config dict if we are loading from JinaCLIPConfig
56
  if configdict.get('model_type') == 'jina_clip':
57
  configdict = configdict['text_config']
 
58
  if (
59
  'model_type' in configdict
60
  and hasattr(cls, 'model_type')
@@ -65,6 +62,7 @@ class JinaCLIPTextConfig(PretrainedConfig):
65
  f'instantiate a model of type {cls.model_type}. This is not supported '
66
  'for all configurations of models and can yield errors.'
67
  )
 
68
  return cls.from_dict(configdict, **kwargs)
69
 
70
 
@@ -127,9 +125,11 @@ class JinaCLIPVisionConfig(PretrainedConfig):
127
  configdict, kwargs = cls.get_config_dict(
128
  pretrained_model_name_or_path, **kwargs
129
  )
 
130
  # get the vision config dict if we are loading from JinaCLIPConfig
131
  if configdict.get('model_type') == 'jina_clip':
132
  configdict = configdict['vision_config']
 
133
  if (
134
  'model_type' in configdict
135
  and hasattr(cls, 'model_type')
@@ -140,6 +140,7 @@ class JinaCLIPVisionConfig(PretrainedConfig):
140
  f'instantiate a model of type {cls.model_type}. This is not supported '
141
  'for all configurations of models and can yield errors.'
142
  )
 
143
  return cls.from_dict(configdict, **kwargs)
144
 
145
 
@@ -158,7 +159,6 @@ class JinaCLIPConfig(PretrainedConfig):
158
  use_vision_xformers: Optional[bool] = None,
159
  matryoshka_dimensions: Optional[List[int]] = None,
160
  truncate_dim: Optional[int] = None,
161
- torch_dtype: Optional[Union[str, torch.dtype]] = None,
162
  **kwargs,
163
  ):
164
  # If `_config_dict` exist, we use them for the backward compatibility.
@@ -286,22 +286,6 @@ class JinaCLIPConfig(PretrainedConfig):
286
  'projections with `add_projections=True`.'
287
  )
288
 
289
- if (
290
- torch_dtype
291
- and hasattr(torch, torch_dtype)
292
- and type(getattr(torch, torch_dtype)) is torch.dtype
293
- ):
294
- self.torch_dtype = getattr(torch, torch_dtype)
295
- else:
296
- self.torch_dtype = torch_dtype
297
-
298
- use_text_flash_attn = (
299
- self.use_text_flash_attn if self.use_text_flash_attn is not None
300
- else self.text_config.hf_model_config_kwargs.get('use_flash_attn', False)
301
- )
302
- if not use_text_flash_attn or not torch.cuda.is_available():
303
- self.torch_dtype = torch.float32
304
-
305
  @classmethod
306
  def from_text_vision_configs(
307
  cls,
 
8
  from copy import deepcopy
9
  from typing import Any, Dict, List, Optional, Union
10
 
 
11
  from transformers import PretrainedConfig, logging
12
 
13
  logger = logging.get_logger(__name__)
 
24
  embed_dim: int = 768,
25
  hf_model_name_or_path: str = 'jinaai/jina-bert-flash-implementation',
26
  hf_model_config_kwargs: Optional[Dict[str, Any]] = None,
 
 
27
  pooler_type: Optional[str] = None,
28
  proj_type: Optional[str] = None,
29
  proj_bias: bool = False,
 
34
  self.embed_dim = embed_dim
35
  self.hf_model_name_or_path = hf_model_name_or_path
36
  self.hf_model_config_kwargs = hf_model_config_kwargs or {}
 
 
37
  self.pooler_type = pooler_type
38
  self.proj_type = proj_type
39
  self.proj_bias = proj_bias
 
47
  configdict, kwargs = cls.get_config_dict(
48
  pretrained_model_name_or_path, **kwargs
49
  )
50
+
51
  # get the text config dict if we are loading from JinaCLIPConfig
52
  if configdict.get('model_type') == 'jina_clip':
53
  configdict = configdict['text_config']
54
+
55
  if (
56
  'model_type' in configdict
57
  and hasattr(cls, 'model_type')
 
62
  f'instantiate a model of type {cls.model_type}. This is not supported '
63
  'for all configurations of models and can yield errors.'
64
  )
65
+
66
  return cls.from_dict(configdict, **kwargs)
67
 
68
 
 
125
  configdict, kwargs = cls.get_config_dict(
126
  pretrained_model_name_or_path, **kwargs
127
  )
128
+
129
  # get the vision config dict if we are loading from JinaCLIPConfig
130
  if configdict.get('model_type') == 'jina_clip':
131
  configdict = configdict['vision_config']
132
+
133
  if (
134
  'model_type' in configdict
135
  and hasattr(cls, 'model_type')
 
140
  f'instantiate a model of type {cls.model_type}. This is not supported '
141
  'for all configurations of models and can yield errors.'
142
  )
143
+
144
  return cls.from_dict(configdict, **kwargs)
145
 
146
 
 
159
  use_vision_xformers: Optional[bool] = None,
160
  matryoshka_dimensions: Optional[List[int]] = None,
161
  truncate_dim: Optional[int] = None,
 
162
  **kwargs,
163
  ):
164
  # If `_config_dict` exist, we use them for the backward compatibility.
 
286
  'projections with `add_projections=True`.'
287
  )
288
 
289
  @classmethod
290
  def from_text_vision_configs(
291
  cls,
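
To make the `from_pretrained` hunks above easier to follow: when the loaded config dict describes the composite `jina_clip` model, the relevant sub-config is extracted before `from_dict` is called. A minimal standalone illustration of that dispatch (field values are placeholders):

```python
# Placeholder dict mimicking a serialized composite JinaCLIPConfig.
configdict = {
    'model_type': 'jina_clip',
    'text_config': {'embed_dim': 768, 'pooler_type': 'mean_pooler'},
    'vision_config': {'embed_dim': 1024},
}

# Same dispatch as JinaCLIPTextConfig.from_pretrained: pull out the text
# sub-config when loading from the combined jina_clip config.
if configdict.get('model_type') == 'jina_clip':
    configdict = configdict['text_config']

print(configdict)  # {'embed_dim': 768, 'pooler_type': 'mean_pooler'}
```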
eva_model.py CHANGED
@@ -5,19 +5,16 @@
5
 
6
  import math
7
  import os
8
- import warnings
9
  from functools import partial
10
 
11
  import torch
12
  import torch.nn as nn
13
- import torch.nn.functional as f
14
 
15
  try:
16
- warnings.filterwarnings('ignore', category=FutureWarning, module='timm')
17
- from timm.models.layers import drop_path as timm_drop_path
18
- from timm.models.layers import to_2tuple, trunc_normal_
19
  except ImportError or ModuleNotFoundError:
20
- from timm.layers import drop_path as timm_drop_path, to_2tuple, trunc_normal_
21
 
22
  from .rope_embeddings import VisionRotaryEmbeddingFast
23
 
@@ -84,7 +81,7 @@ class DropPath(nn.Module):
84
  self.drop_prob = drop_prob
85
 
86
  def forward(self, x):
87
- return timm_drop_path(x, self.drop_prob, self.training)
88
 
89
  def extra_repr(self) -> str:
90
  return 'p={}'.format(self.drop_prob)
@@ -247,17 +244,17 @@ class Attention(nn.Module):
247
  self.rope = rope
248
 
249
  def forward(self, x, rel_pos_bias=None, attn_mask=None):
250
- b, n, _ = x.shape
251
  if self.subln:
252
- q = f.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
253
- k = f.linear(input=x, weight=self.k_proj.weight, bias=None)
254
- v = f.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)
255
 
256
- q = q.reshape(b, n, self.num_heads, -1).permute(
257
  0, 2, 1, 3
258
  ) # B, num_heads, N, C
259
- k = k.reshape(b, n, self.num_heads, -1).permute(0, 2, 1, 3)
260
- v = v.reshape(b, n, self.num_heads, -1).permute(0, 2, 1, 3)
261
  else:
262
  qkv_bias = None
263
  if self.q_bias is not None:
@@ -269,8 +266,8 @@ class Attention(nn.Module):
269
  )
270
  )
271
 
272
- qkv = f.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
273
- qkv = qkv.reshape(b, n, 3, self.num_heads, -1).permute(
274
  2, 0, 3, 1, 4
275
  ) # 3, B, num_heads, N, C
276
  q, k, v = qkv[0], qkv[1], qkv[2]
@@ -301,7 +298,7 @@ class Attention(nn.Module):
301
  p=self.xattn_drop,
302
  scale=self.scale,
303
  )
304
- x = x.reshape(b, n, -1)
305
  x = self.inner_attn_ln(x)
306
  x = self.proj(x)
307
  x = self.proj_drop(x)
@@ -332,7 +329,7 @@ class Attention(nn.Module):
332
  attn = attn.softmax(dim=-1)
333
  attn = self.attn_drop(attn)
334
 
335
- x = (attn @ v).transpose(1, 2).reshape(b, n, -1)
336
  x = self.inner_attn_ln(x)
337
  x = self.proj(x)
338
  x = self.proj_drop(x)
@@ -464,12 +461,12 @@ class PatchEmbed(nn.Module):
464
  in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
465
  )
466
 
467
- def forward(self, x, **_):
468
  target_dtype = self.proj.weight.dtype
469
- _, __, h, w = x.shape
470
  # FIXME look at relaxing size constraints
471
- assert h == self.img_size[0] and w == self.img_size[1], (
472
- f"Input image size ({h}*{w}) doesn't match model "
473
  f'({self.img_size[0]}*{self.img_size[1]}).'
474
  )
475
  x = self.proj(x.to(dtype=target_dtype)).flatten(2).transpose(1, 2)
@@ -562,8 +559,9 @@ class EVAVisionTransformer(nn.Module):
562
  super().__init__()
563
  self.image_size = img_size
564
  self.num_classes = num_classes
565
- # num_features for consistency with other models
566
- self.num_features = self.embed_dim = embed_dim
 
567
 
568
  self.patch_embed = PatchEmbed(
569
  img_size=img_size,
@@ -668,8 +666,8 @@ class EVAVisionTransformer(nn.Module):
668
  self.grad_checkpointing = grad_checkpointing
669
 
670
  def fix_init_weight(self):
671
- def rescale(param, _layer_id):
672
- param.div_(math.sqrt(2.0 * _layer_id))
673
 
674
  for layer_id, layer in enumerate(self.blocks):
675
  rescale(layer.attn.proj.weight.data, layer_id + 1)
@@ -681,8 +679,7 @@ class EVAVisionTransformer(nn.Module):
681
  def get_cast_dtype(self) -> torch.dtype:
682
  return self.blocks[0].mlp.fc2.weight.dtype
683
 
684
- @staticmethod
685
- def _init_weights(m):
686
  if isinstance(m, nn.Linear):
687
  trunc_normal_(m.weight, std=0.02)
688
  if m.bias is not None:
@@ -694,7 +691,7 @@ class EVAVisionTransformer(nn.Module):
694
  def get_num_layers(self):
695
  return len(self.blocks)
696
 
697
- def lock(self, unlocked_groups=0, *_, **__):
698
  assert (
699
  unlocked_groups == 0
700
  ), 'partial locking not currently supported for this model'
@@ -712,7 +709,7 @@ class EVAVisionTransformer(nn.Module):
712
  def get_classifier(self):
713
  return self.head
714
 
715
- def reset_classifier(self, num_classes, *_, **__):
716
  self.num_classes = num_classes
717
  self.head = (
718
  nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
 
5
 
6
  import math
7
  import os
 
8
  from functools import partial
9
 
10
  import torch
11
  import torch.nn as nn
12
+ import torch.nn.functional as F
13
 
14
  try:
15
+ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
 
 
16
  except ImportError or ModuleNotFoundError:
17
+ from timm.layers import drop_path, to_2tuple, trunc_normal_
18
 
19
  from .rope_embeddings import VisionRotaryEmbeddingFast
20
 
 
81
  self.drop_prob = drop_prob
82
 
83
  def forward(self, x):
84
+ return drop_path(x, self.drop_prob, self.training)
85
 
86
  def extra_repr(self) -> str:
87
  return 'p={}'.format(self.drop_prob)
 
244
  self.rope = rope
245
 
246
  def forward(self, x, rel_pos_bias=None, attn_mask=None):
247
+ B, N, C = x.shape
248
  if self.subln:
249
+ q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
250
+ k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
251
+ v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)
252
 
253
+ q = q.reshape(B, N, self.num_heads, -1).permute(
254
  0, 2, 1, 3
255
  ) # B, num_heads, N, C
256
+ k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
257
+ v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
258
  else:
259
  qkv_bias = None
260
  if self.q_bias is not None:
 
266
  )
267
  )
268
 
269
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
270
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(
271
  2, 0, 3, 1, 4
272
  ) # 3, B, num_heads, N, C
273
  q, k, v = qkv[0], qkv[1], qkv[2]
 
298
  p=self.xattn_drop,
299
  scale=self.scale,
300
  )
301
+ x = x.reshape(B, N, -1)
302
  x = self.inner_attn_ln(x)
303
  x = self.proj(x)
304
  x = self.proj_drop(x)
 
329
  attn = attn.softmax(dim=-1)
330
  attn = self.attn_drop(attn)
331
 
332
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
333
  x = self.inner_attn_ln(x)
334
  x = self.proj(x)
335
  x = self.proj_drop(x)
 
461
  in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
462
  )
463
 
464
+ def forward(self, x, **kwargs):
465
  target_dtype = self.proj.weight.dtype
466
+ B, C, H, W = x.shape
467
  # FIXME look at relaxing size constraints
468
+ assert H == self.img_size[0] and W == self.img_size[1], (
469
+ f"Input image size ({H}*{W}) doesn't match model "
470
  f'({self.img_size[0]}*{self.img_size[1]}).'
471
  )
472
  x = self.proj(x.to(dtype=target_dtype)).flatten(2).transpose(1, 2)
 
559
  super().__init__()
560
  self.image_size = img_size
561
  self.num_classes = num_classes
562
+ self.num_features = (
563
+ self.embed_dim
564
+ ) = embed_dim # num_features for consistency with other models
565
 
566
  self.patch_embed = PatchEmbed(
567
  img_size=img_size,
 
666
  self.grad_checkpointing = grad_checkpointing
667
 
668
  def fix_init_weight(self):
669
+ def rescale(param, layer_id):
670
+ param.div_(math.sqrt(2.0 * layer_id))
671
 
672
  for layer_id, layer in enumerate(self.blocks):
673
  rescale(layer.attn.proj.weight.data, layer_id + 1)
 
679
  def get_cast_dtype(self) -> torch.dtype:
680
  return self.blocks[0].mlp.fc2.weight.dtype
681
 
682
+ def _init_weights(self, m):
 
683
  if isinstance(m, nn.Linear):
684
  trunc_normal_(m.weight, std=0.02)
685
  if m.bias is not None:
 
691
  def get_num_layers(self):
692
  return len(self.blocks)
693
 
694
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
695
  assert (
696
  unlocked_groups == 0
697
  ), 'partial locking not currently supported for this model'
 
709
  def get_classifier(self):
710
  return self.head
711
 
712
+ def reset_classifier(self, num_classes, global_pool=''):
713
  self.num_classes = num_classes
714
  self.head = (
715
  nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
hf_model.py CHANGED
@@ -1,6 +1,5 @@
1
  import re
2
- import warnings
3
- from typing import Dict, Optional, Union
4
 
5
  import torch
6
  import torch.nn as nn
@@ -11,6 +10,10 @@ from transformers.modeling_outputs import (
11
  BaseModelOutputWithPoolingAndCrossAttentions,
12
  )
13
 
 
 
 
 
14
  _HF_ARCH_DICT = {
15
  # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
16
  'roberta': {
@@ -38,6 +41,22 @@ _HF_ARCH_DICT = {
38
  },
39
  'pooler': 'mean_pooler',
40
  },
41
  # https://huggingface.co/docs/transformers/model_doc/bert
42
  'bert': {
43
  'config_names': {
@@ -49,8 +68,24 @@ _HF_ARCH_DICT = {
49
  },
50
  'pooler': 'cls_pooler',
51
  },
52
  }
53
 
 
 
 
 
 
54
  _POOLERS = {}
55
 
56
 
@@ -66,6 +101,8 @@ def register_pooler(cls):
66
 
67
  @register_pooler
68
  class MeanPooler(nn.Module):
 
 
69
  @staticmethod
70
  def forward(x: BaseModelOutput, attention_mask: torch.Tensor):
71
  masked_output = x.last_hidden_state * attention_mask.unsqueeze(-1)
@@ -74,6 +111,10 @@ class MeanPooler(nn.Module):
74
 
75
  @register_pooler
76
  class MaxPooler(nn.Module):
 
 
 
 
77
  @staticmethod
78
  def forward(x: BaseModelOutput, attention_mask: torch.Tensor):
79
  masked_output = x.last_hidden_state.masked_fill(
@@ -84,7 +125,11 @@ class MaxPooler(nn.Module):
84
 
85
  @register_pooler
86
  class ClsPooler(nn.Module):
87
- def __init__(self, use_pooler_output: bool = True):
 
 
 
 
88
  super().__init__()
89
  self.cls_token_position = 0
90
  self.use_pooler_output = use_pooler_output
@@ -102,9 +147,15 @@ class ClsPooler(nn.Module):
102
  and (x.pooler_output is not None)
103
  ):
104
  return x.pooler_output
 
105
  return x.last_hidden_state[:, self.cls_token_position, :]
106
 
107
 
 
 
 
 
 
108
  class HFTextEncoder(nn.Module):
109
  output_tokens: torch.jit.Final[bool]
110
 
@@ -120,60 +171,56 @@ class HFTextEncoder(nn.Module):
120
  output_tokens: bool = False,
121
  trust_remote_code: bool = False,
122
  revision: Optional[str] = None,
123
- code_revision: Optional[str] = None,
124
- default_instruction_task: Optional[str] = None,
125
- default_lora_task: Optional[str] = None,
126
  model_config_kwargs: Optional[Dict] = None,
127
  ):
128
  super().__init__()
129
  self.output_tokens = output_tokens
130
  self.output_dim = output_dim
131
 
 
 
132
  model_config_kwargs = model_config_kwargs or {}
133
 
134
  if config is None:
135
- if pretrained:
136
- self.transformer = AutoModel.from_pretrained(
137
- model_name_or_path,
138
- trust_remote_code=trust_remote_code,
139
- revision=revision,
140
- add_pooling_layer=False,
141
- code_revision=code_revision,
142
- **model_config_kwargs,
143
- )
144
- self.config = self.transformer.config
145
- else:
146
- self.config = AutoConfig.from_pretrained(
147
- model_name_or_path,
148
- trust_remote_code=trust_remote_code,
149
- code_revision=code_revision,
150
- )
151
- self.config.update(model_config_kwargs)
152
- self.transformer = AutoModel.from_config(
153
- self.config,
154
- trust_remote_code=trust_remote_code,
155
- add_pooling_layer=False,
156
- code_revision=code_revision,
157
- )
158
  if (
159
  hasattr(self.config, 'is_encoder_decoder')
160
  and self.config.is_encoder_decoder
161
  ):
 
162
  self.transformer = self.transformer.encoder
163
-
 
 
 
 
 
 
164
  else:
165
  self.config = config
166
  self.config.update(model_config_kwargs)
167
- self.transformer = AutoModel.from_config(
168
- self.config,
169
- trust_remote_code=trust_remote_code,
170
- revision=revision,
171
- code_revision=code_revision,
172
- )
 
173
  self.vocab_size = getattr(self.config, 'vocab_size', 0)
174
  self.context_length = getattr(self.config, 'max_position_embeddings', 0)
175
 
176
- pooler_type = pooler_type or _HF_ARCH_DICT[self.config.model_type]['pooler']
177
  self.pooler = _POOLERS[pooler_type]()
178
 
179
  d_model = getattr(
@@ -181,7 +228,7 @@ class HFTextEncoder(nn.Module):
181
  )
182
  if (d_model == output_dim) and (proj_type is None): # do we always need a proj?
183
  self.proj = nn.Identity()
184
- elif (d_model != output_dim) or proj_type == 'linear':
185
  self.proj = nn.Linear(d_model, output_dim, bias=proj_bias)
186
  elif proj_type == 'mlp':
187
  hidden_size = (d_model + output_dim) // 2
@@ -191,149 +238,27 @@ class HFTextEncoder(nn.Module):
191
  nn.Linear(hidden_size, output_dim, bias=proj_bias),
192
  )
193
 
194
- self._task_instructions = {}
195
- self._lora_adaptation_map = {}
196
- self._supports_task_instructions = False
197
- self._supports_lora = False
198
- if (
199
- hasattr(self.transformer, '_adaptation_map')
200
- and len(self.transformer._adaptation_map) > 0
201
- ):
202
- self._lora_adaptation_map = self.transformer._adaptation_map
203
- self._supports_lora = True
204
- if (
205
- hasattr(self.transformer, '_task_instructions')
206
- and len(self.transformer._task_instructions) > 0
207
- ):
208
- self._task_instructions = self.transformer._task_instructions
209
- self._supports_task_instructions = True
210
-
211
- self._default_instruction_task = None
212
- self._default_lora_task = None
213
- self._default_instruction = None
214
- self._default_loraid = None
215
-
216
- if default_instruction_task is not None:
217
- self._default_instruction_task = default_instruction_task
218
- self._default_instruction = self.get_instruction_from_task(
219
- default_instruction_task
220
- )
221
- if default_lora_task is not None:
222
- self._default_lora_task = default_lora_task
223
- self._default_loraid = self.get_loraid_from_task(default_lora_task)
224
-
225
- @property
226
- def supports_task_instructions(self) -> bool:
227
- return self._supports_task_instructions
228
-
229
- @property
230
- def supports_lora(self) -> bool:
231
- return self._supports_lora
232
-
233
- @property
234
- def task_instructions(self) -> Dict[str, str]:
235
- return self._task_instructions
236
-
237
- @property
238
- def lora_adaptation_map(self) -> Dict[str, int]:
239
- return self._lora_adaptation_map
240
-
241
- @property
242
- def default_instruction(self) -> Optional[str]:
243
- return self._default_instruction
244
-
245
- @property
246
- def default_loraid(self) -> Optional[int]:
247
- return self._default_loraid
248
-
249
- def get_instruction_from_task(self, task: Optional[str]) -> Optional[str]:
250
- if self._supports_task_instructions:
251
- if task is None:
252
- return self._default_instruction
253
- if task not in self._task_instructions:
254
- raise ValueError(
255
- f'Unsupported task \'{task}\'. Choose one of the following: '
256
- f'{", ".join(self._task_instructions)} or set to None to disable '
257
- f'task instructions completely'
258
- )
259
- return self._task_instructions[task]
260
- else:
261
- if task is not None:
262
- warnings.warn(
263
- 'Model does not support task instructions, ignoring instruction '
264
- f"task '{task}'"
265
- )
266
- return None
267
-
268
- def get_loraid_from_task(self, task: Optional[str]) -> Optional[int]:
269
- if self._supports_lora:
270
- if task is None:
271
- return self._default_loraid
272
- if task not in self._lora_adaptation_map:
273
- raise ValueError(
274
- f'Unsupported task \'{task}\'. Choose one of the following: '
275
- f'{", ".join(self._task_instructions)} or set to None to disable '
276
- f'the LoRA adapters completely'
277
- )
278
- return self._lora_adaptation_map[task]
279
- else:
280
- if task is not None:
281
- warnings.warn(
282
- f"Model does not support LoRA adapters, ignoring LoRA task '{task}'"
283
- )
284
- return None
285
-
286
- @staticmethod
287
- def get_adapter_mask_from_loraid(
288
- batch_size: int, loraid: int, device: Union[str, torch.device]
289
- ):
290
- return torch.full((batch_size,), loraid, dtype=torch.int32, device=device)
291
-
292
- @torch.jit.ignore
293
- def set_grad_checkpointing(self, _=True):
294
- self.transformer.gradient_checkpointing_enable()
295
-
296
- def init_parameters(self):
297
- pass
298
-
299
- def forward(self, x: torch.Tensor, adapter_mask: Optional[torch.Tensor] = None):
300
- if adapter_mask is None:
301
- default_loraid = self.default_loraid
302
- if default_loraid is not None:
303
- adapter_mask = self.get_adapter_mask_from_loraid(
304
- x.shape[0], default_loraid, x.device
305
- )
306
- else:
307
- if not self.supports_lora:
308
- warnings.warn(
309
- 'Model does not support LoRA adapters, setting adapter_mask to None'
310
- )
311
- adapter_mask = None
312
-
313
- attention_mask = (x != self.config.pad_token_id).long()
314
- lora_kwargs = {}
315
- if adapter_mask is not None:
316
- lora_kwargs['adapter_mask'] = adapter_mask
317
-
318
- out = self.transformer(
319
- input_ids=x, attention_mask=attention_mask, **lora_kwargs
320
- )
321
- pooled_out = self.pooler(out, attention_mask)
322
  projected = self.proj(pooled_out)
323
- seqlen = out.last_hidden_state.shape[1]
 
324
  tokens = (
325
  out.last_hidden_state[
326
- :, torch.arange(seqlen) != self.pooler.cls_token_position, :
327
  ]
328
  if isinstance(self.pooler, ClsPooler)
329
  else out.last_hidden_state
330
  )
 
331
  if self.output_tokens:
332
  return projected, tokens
333
  return projected
334
 
335
  def lock(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
336
- if not unlocked_layers:
337
  for n, p in self.transformer.named_parameters():
338
  p.requires_grad = (
339
  (not freeze_layer_norm) if 'LayerNorm' in n.split('.') else False
@@ -362,3 +287,11 @@ class HFTextEncoder(nn.Module):
362
  p.requires_grad = (
363
  (not freeze_layer_norm) if 'LayerNorm' in n.split('.') else False
364
  )
1
  import re
2
+ from typing import Dict, Optional, Tuple
 
3
 
4
  import torch
5
  import torch.nn as nn
 
10
  BaseModelOutputWithPoolingAndCrossAttentions,
11
  )
12
 
13
+ """
14
+ HF architecture mapping
15
+ """
16
+
17
  _HF_ARCH_DICT = {
18
  # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
19
  'roberta': {
 
41
  },
42
  'pooler': 'mean_pooler',
43
  },
44
+ # https://huggingface.co/docs/transformers/model_doc/mt5#mt5
45
+ 'mt5': {
46
+ 'config_names': {
47
+ # unlimited seqlen
48
+ # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
49
+ # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
50
+ 'context_length': '',
51
+ 'vocab_size': 'vocab_size',
52
+ 'width': 'd_model',
53
+ 'heads': 'num_heads',
54
+ 'layers': 'num_layers',
55
+ 'layer_attr': 'block',
56
+ 'token_embeddings_attr': 'embed_tokens',
57
+ },
58
+ 'pooler': 'mean_pooler',
59
+ },
60
  # https://huggingface.co/docs/transformers/model_doc/bert
61
  'bert': {
62
  'config_names': {
 
68
  },
69
  'pooler': 'cls_pooler',
70
  },
71
+ # https://huggingface.co/docs/transformers/model_doc/m2m_100
72
+ 'm2m_100': {
73
+ 'config_names': {
74
+ 'context_length': 'max_position_embeddings',
75
+ 'vocab_size': 'vocab_size',
76
+ 'width': 'd_model',
77
+ 'heads': 'encoder_attention_heads',
78
+ 'layers': 'encoder_layers',
79
+ },
80
+ 'pooler': 'cls_pooler',
81
+ },
82
  }
83
 
84
+
85
+ """
86
+ Pooling functions
87
+ """
88
+
89
  _POOLERS = {}
90
 
91
 
 
101
 
102
  @register_pooler
103
  class MeanPooler(nn.Module):
104
+ """Mean pooling"""
105
+
106
  @staticmethod
107
  def forward(x: BaseModelOutput, attention_mask: torch.Tensor):
108
  masked_output = x.last_hidden_state * attention_mask.unsqueeze(-1)
 
111
 
112
  @register_pooler
113
  class MaxPooler(nn.Module):
114
+ """
115
+ Max pooling
116
+ """
117
+
118
  @staticmethod
119
  def forward(x: BaseModelOutput, attention_mask: torch.Tensor):
120
  masked_output = x.last_hidden_state.masked_fill(
 
125
 
126
  @register_pooler
127
  class ClsPooler(nn.Module):
128
+ """
129
+ CLS token pooling
130
+ """
131
+
132
+ def __init__(self, use_pooler_output=True):
133
  super().__init__()
134
  self.cls_token_position = 0
135
  self.use_pooler_output = use_pooler_output
 
147
  and (x.pooler_output is not None)
148
  ):
149
  return x.pooler_output
150
+
151
  return x.last_hidden_state[:, self.cls_token_position, :]
152
 
153
 
154
+ """
155
+ HF text model
156
+ """
157
+
158
+
159
  class HFTextEncoder(nn.Module):
160
  output_tokens: torch.jit.Final[bool]
161
 
 
171
  output_tokens: bool = False,
172
  trust_remote_code: bool = False,
173
  revision: Optional[str] = None,
 
 
 
174
  model_config_kwargs: Optional[Dict] = None,
175
  ):
176
  super().__init__()
177
  self.output_tokens = output_tokens
178
  self.output_dim = output_dim
179
 
180
+ # TODO: find better way to get this information
181
+ uses_transformer_pooler = pooler_type == 'cls_pooler'
182
  model_config_kwargs = model_config_kwargs or {}
183
 
184
  if config is None:
185
+ self.config = AutoConfig.from_pretrained(
186
+ model_name_or_path,
187
+ trust_remote_code=trust_remote_code,
188
+ code_revision=revision,
189
+ )
190
+ self.config.update(model_config_kwargs)
191
+ create_func, model_args = (
192
+ (AutoModel.from_pretrained, model_name_or_path)
193
+ if pretrained
194
+ else (AutoModel.from_config, self.config)
195
+ )
196
+ # TODO: do all model configs have this attribute?
197
+ # PretrainedConfig does so yes??
198
  if (
199
  hasattr(self.config, 'is_encoder_decoder')
200
  and self.config.is_encoder_decoder
201
  ):
202
+ self.transformer = create_func(model_args)
203
  self.transformer = self.transformer.encoder
204
+ else:
205
+ self.transformer = create_func(
206
+ model_args,
207
+ trust_remote_code=trust_remote_code,
208
+ add_pooling_layer=uses_transformer_pooler,
209
+ code_revision=revision,
210
+ )
211
  else:
212
  self.config = config
213
  self.config.update(model_config_kwargs)
214
+ self.transformer = AutoModel.from_config(self.config)
215
+
216
+ if pooler_type is None: # get default arch pooler
217
+ pooler_type = _HF_ARCH_DICT[self.config.model_type]['pooler']
218
+
219
+ # FIXME downstream users of OpenCLIP models use these attr,
220
+ # need to verify valid across all models
221
  self.vocab_size = getattr(self.config, 'vocab_size', 0)
222
  self.context_length = getattr(self.config, 'max_position_embeddings', 0)
223
 
 
224
  self.pooler = _POOLERS[pooler_type]()
225
 
226
  d_model = getattr(
 
228
  )
229
  if (d_model == output_dim) and (proj_type is None): # do we always need a proj?
230
  self.proj = nn.Identity()
231
+ elif proj_type == 'linear':
232
  self.proj = nn.Linear(d_model, output_dim, bias=proj_bias)
233
  elif proj_type == 'mlp':
234
  hidden_size = (d_model + output_dim) // 2
 
238
  nn.Linear(hidden_size, output_dim, bias=proj_bias),
239
  )
240
 
241
+ def forward(self, x: torch.Tensor):
242
+ attn_mask = (x != self.config.pad_token_id).long()
243
+ out = self.transformer(input_ids=x, attention_mask=attn_mask)
244
+ pooled_out = self.pooler(out, attn_mask)
245
  projected = self.proj(pooled_out)
246
+
247
+ seq_len = out.last_hidden_state.shape[1]
248
  tokens = (
249
  out.last_hidden_state[
250
+ :, torch.arange(seq_len) != self.pooler.cls_token_position, :
251
  ]
252
  if isinstance(self.pooler, ClsPooler)
253
  else out.last_hidden_state
254
  )
255
+
256
  if self.output_tokens:
257
  return projected, tokens
258
  return projected
259
 
260
  def lock(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
261
+ if not unlocked_layers: # full freezing
262
  for n, p in self.transformer.named_parameters():
263
  p.requires_grad = (
264
  (not freeze_layer_norm) if 'LayerNorm' in n.split('.') else False
 
287
  p.requires_grad = (
288
  (not freeze_layer_norm) if 'LayerNorm' in n.split('.') else False
289
  )
290
+
291
+ @torch.jit.ignore
292
+ def set_grad_checkpointing(self, _=True):
293
+ self.transformer.gradient_checkpointing_enable()
294
+
295
+ def init_parameters(self):
296
+ pass
297
+
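
The pooler docstrings added above are terse, so here is a standalone sketch of what MeanPooler-style pooling computes (not the class itself): a per-sequence average of `last_hidden_state` over non-padding positions.

```python
import torch

last_hidden_state = torch.randn(2, 5, 8)          # (batch, seq_len, hidden)
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])  # 1 = token, 0 = padding

# Mask out padding, sum over the sequence axis, divide by the token count.
masked = last_hidden_state * attention_mask.unsqueeze(-1)
pooled = masked.sum(dim=1) / attention_mask.sum(dim=-1, keepdim=True)
print(pooled.shape)  # torch.Size([2, 8])
```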
modeling_clip.py CHANGED
@@ -5,8 +5,6 @@
5
  # and adjusted for Jina CLIP
6
 
7
  import base64
8
- import importlib.util
9
- import warnings
10
  from functools import partial
11
  from io import BytesIO
12
  from typing import List, Optional, Tuple, Union
@@ -16,7 +14,6 @@ import requests
16
  import torch
17
  import torch.nn.functional as f
18
  import torch.utils.checkpoint
19
- from PIL import Image
20
  from torch import nn
21
  from transformers import (
22
  AutoImageProcessor,
@@ -38,12 +35,13 @@ try:
38
 
39
  has_tqdm = True
40
  except ImportError:
41
- trange = None
42
  has_tqdm = False
43
 
44
  from .configuration_clip import JinaCLIPConfig, JinaCLIPTextConfig, JinaCLIPVisionConfig
45
  from .eva_model import EVAVisionTransformer
46
  from .hf_model import HFTextEncoder
 
 
47
  from .rope_embeddings import VisionRotaryEmbeddingFast # noqa: F401
48
  from .transform import ( # noqa: F401
49
  OPENAI_DATASET_MEAN,
@@ -70,8 +68,6 @@ def _build_text_tower(config: JinaCLIPTextConfig) -> HFTextEncoder:
70
  return HFTextEncoder(
71
  model_name_or_path=config.hf_model_name_or_path,
72
  output_dim=config.embed_dim,
73
- default_instruction_task=config.default_instruction_task,
74
- default_lora_task=config.default_lora_task,
75
  pooler_type=config.pooler_type,
76
  proj_type=config.proj_type,
77
  proj_bias=config.proj_bias,
@@ -119,80 +115,6 @@ def _build_vision_tower(config: JinaCLIPVisionConfig) -> EVAVisionTransformer:
119
  )
120
 
121
 
122
- def _resolve_attention_libs(config: JinaCLIPConfig):
123
- use_text_flash_attn = (
124
- config.use_text_flash_attn
125
- if config.use_text_flash_attn is not None
126
- else config.text_config.hf_model_config_kwargs.get('use_flash_attn', True)
127
- )
128
- use_vision_xformers = (
129
- config.use_vision_xformers
130
- if config.use_vision_xformers is not None
131
- else config.vision_config.x_attention
132
- )
133
-
134
- def _resolve_use_text_flash_attn() -> bool:
135
- if use_text_flash_attn:
136
- if not torch.cuda.is_available():
137
- warnings.warn('Flash attention requires CUDA, disabling')
138
- return False
139
- if importlib.util.find_spec('flash_attn') is None:
140
- warnings.warn(
141
- 'Flash attention is not installed. Check '
142
- 'https://github.com/Dao-AILab/flash-attention?'
143
- 'tab=readme-ov-file#installation-and-features '
144
- 'for installation instructions, disabling'
145
- )
146
- return False
147
- major, minor, *_ = torch.version.cuda.split('.')
148
- major, minor = int(major), int(minor)
149
- if major < 11 or (major == 11 and minor < 7):
150
- warnings.warn(
151
- 'Flash attention requires CUDA>=11.7. Found version '
152
- f'{major}.{minor}, disabling'
153
- )
154
- return False
155
- capability = torch.cuda.get_device_capability()
156
- major, *_ = capability
157
- major = int(major)
158
- if major < 8:
159
- device_name = torch.cuda.get_device_properties(0).name
160
- warnings.warn(
161
- 'Flash attention requires device capability>=8.0 (NVIDIA Ampere, '
162
- f'Hopper or ADA). Found device {device_name} with capability '
163
- f'{capability}, disabling'
164
- )
165
- return False
166
- return True
167
- return False
168
-
169
- def _resolve_use_vision_xformers() -> bool:
170
- if use_vision_xformers:
171
- if not torch.cuda.is_available():
172
- warnings.warn('xFormers requires CUDA, disabling')
173
- return False
174
- if importlib.util.find_spec('xformers') is None:
175
- warnings.warn(
176
- 'xFormers is not installed. Check '
177
- 'https://github.com/facebookresearch/xformers?'
178
- 'tab=readme-ov-file#installing-xformers for installation '
179
- 'instructions, disabling'
180
- )
181
- return False
182
- return True
183
- return False
184
-
185
- _use_text_flash_attn = _resolve_use_text_flash_attn()
186
- _use_vision_xformers = _resolve_use_vision_xformers()
187
-
188
- config.use_text_flash_attn = _use_text_flash_attn
189
- config.use_vision_xformers = _use_vision_xformers
190
- config.text_config.hf_model_config_kwargs['use_flash_attn'] = _use_text_flash_attn
191
- config.vision_config.x_attention = _use_vision_xformers
192
-
193
- return config
194
-
195
-
196
  class JinaCLIPPreTrainedModel(PreTrainedModel):
197
  """
198
  An abstract class to handle weights initialization and a simple interface for
@@ -222,12 +144,6 @@ class JinaCLIPPreTrainedModel(PreTrainedModel):
222
  if isinstance(module, nn.Linear) and module.bias is not None:
223
  module.bias.data.zero_()
224
 
225
- @classmethod
226
- def from_pretrained(cls, *args, **kwargs):
227
- if 'torch_dtype' not in kwargs:
228
- kwargs['torch_dtype'] = 'auto'
229
- return super().from_pretrained(*args, **kwargs)
230
-
231
 
232
  class JinaCLIPTextModel(JinaCLIPPreTrainedModel):
233
  config_class = JinaCLIPTextConfig
@@ -300,19 +216,25 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
300
  f'JinaCLIPVisionConfig but is of type {type(config.vision_config)}.'
301
  )
302
 
303
- config = _resolve_attention_libs(config)
304
  text_config = config.text_config
305
  vision_config = config.vision_config
306
 
 
 
 
 
 
307
  self.add_projections = config.add_projections
308
  self.projection_dim = config.projection_dim
309
  self.text_embed_dim = text_config.embed_dim
310
  self.vision_embed_dim = vision_config.embed_dim
 
311
  self.text_model = _build_text_tower(text_config)
312
  self.vision_model = _build_vision_tower(vision_config)
313
  self.logit_scale = nn.Parameter(
314
  torch.tensor(self.config.logit_scale_init_value)
315
  )
 
316
  if self.add_projections:
317
  self.visual_projection = nn.Linear(
318
  self.vision_embed_dim, self.projection_dim, bias=False
@@ -329,7 +251,7 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
329
  self.post_init()
330
 
331
  def get_tokenizer(self):
332
- if self.tokenizer is None:
333
  self.tokenizer = AutoTokenizer.from_pretrained(
334
  self.config._name_or_path, trust_remote_code=True
335
  )
@@ -364,24 +286,24 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
364
  )
365
  return self.visual_projection(self.vision_model(x=x))
366
 
367
- def _truncate_embeddings(self, embeddings: torch.Tensor, truncate_dim: int):
368
  if not self.config.matryoshka_dimensions:
369
  logger.warning(
370
- 'Model is not trained using Matryoshka Representation Learning, '
371
372
  )
373
- return embeddings[:, :truncate_dim]
374
-
375
- @staticmethod
376
- def _decode_image_data(image_data_str: str) -> Image:
377
- header, data = image_data_str.split(',', 1)
378
- image_data = base64.b64decode(data)
379
- return Image.open(BytesIO(image_data))
380
 
381
  @torch.inference_mode()
382
- def encode_image(
383
  self,
384
- images: Union[str, List[Union[str, 'Image.Image']]],
385
  batch_size: int = 32,
386
  show_progress_bar: Optional[bool] = None,
387
  convert_to_numpy: bool = True,
@@ -389,129 +311,122 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
389
  device: Optional[torch.device] = None,
390
  normalize_embeddings: bool = True,
391
  truncate_dim: Optional[int] = None,
 
392
  ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
393
  """
394
- Computes image embeddings
395
-
396
- Args:
397
- images(`str` or `List[Union[str, Image.Image]]`):
398
- Image paths, URLs, PIL images, or data:image/ strings to be encoded
399
- batch_size(`int`, *optional*, defaults to 32):
400
- Batch size for the computation
401
- show_progress_bar(`bool`, *optional*, defaults to None):
402
- Show a progress bar when encoding images. If set to None, progress bar
403
- is only shown when `logger.level == logging.INFO` or
404
- `logger.level == logging.DEBUG`
405
- convert_to_numpy(`bool`, *optional*, defaults to True):
406
- If true, the output is a list of numpy vectors. Else, it is a list of
407
- pytorch tensors
408
- convert_to_tensor(`bool`, *optional*, defaults to False):
409
- If true, you get one large tensor as return. Overwrites any setting
410
- from convert_to_numpy
411
- device(`torch.device`, *optional*, defaults to None):
412
- Which torch.device to use for the computation
413
- normalize_embeddings(`bool`, *optional*, defaults to True):
414
- If set to true, returned vectors will have length 1. In that case,
415
- the faster dot-product (util.dot_score) instead of cosine similarity
416
- can be used
417
- truncate_dim(`int`, *optional*, defaults to None):
418
- The dimension to truncate sentence embeddings to. If set to `None`
419
- no truncation is performed
420
-
421
- Returns:
422
- By default, a list of tensors is returned. If convert_to_tensor, a stacked
423
- tensor is returned. If convert_to_numpy, a numpy matrix is returned
424
  """
425
-
426
- _is_training = self.training
427
  self.eval()
428
-
429
- self.preprocess = self.get_preprocess()
430
  all_embeddings = []
431
 
 
 
432
  if show_progress_bar is None:
433
  show_progress_bar = (
434
  logger.getEffectiveLevel() == logging.INFO
435
  or logger.getEffectiveLevel() == logging.DEBUG
436
  )
 
437
  if convert_to_tensor:
438
  convert_to_numpy = False
439
 
440
- _input_was_single_img = False
441
- if isinstance(images, str) or not hasattr(images, '__len__'):
442
- images = [images]
443
- _input_was_single_img = True
444
 
445
  if device is not None:
446
  self.to(device)
447
 
448
- _permutation = np.argsort([-len(str(i)) for i in images])
449
- _inverse_permutation = np.argsort(_permutation)
450
- images = [images[idx] for idx in _permutation]
 
 
 
 
451
 
452
  if has_tqdm:
453
  range_iter = trange(
454
  0,
455
- len(images),
456
  batch_size,
457
  desc='Encoding',
458
  disable=not show_progress_bar,
459
  )
460
  else:
461
- range_iter = range(0, len(images), batch_size)
462
 
463
  truncate_dim = truncate_dim or self.config.truncate_dim
464
-
465
  for i in range_iter:
466
- _processed_images = []
467
- for img in images[i: i + batch_size]:
468
- if isinstance(img, str):
469
- if img.startswith('http'):
470
- response = requests.get(img)
471
- image = Image.open(BytesIO(response.content)).convert('RGB')
472
- elif img.startswith('data:image/'):
473
- image = self._decode_image_data(img).convert('RGB')
474
- else:
475
- image = Image.open(img).convert('RGB')
476
- elif isinstance(img, Image.Image):
477
- image = img.convert('RGB')
478
- else:
479
- raise ValueError('Unsupported image format')
480
- _processed_images.append(image)
481
 
482
- pixelvals = self.preprocess(_processed_images)
483
- pixelvals = pixelvals.to(self.device)
484
- embeddings = self.get_image_features(pixelvals)
485
 
486
  if truncate_dim:
487
- embeddings = self._truncate_embeddings(embeddings, truncate_dim)
488
  if normalize_embeddings:
489
- embeddings = f.normalize(embeddings, p=2, dim=1)
490
  if convert_to_numpy:
491
  embeddings = embeddings.cpu()
492
-
493
  all_embeddings.extend(embeddings)
494
 
495
- all_embeddings = [all_embeddings[idx] for idx in _inverse_permutation]
496
 
497
  if convert_to_tensor:
498
  all_embeddings = torch.stack(all_embeddings)
499
  elif convert_to_numpy:
500
- all_embeddings = np.asarray(
501
- [emb.to(torch.float32).numpy() for emb in all_embeddings]
502
- )
503
 
504
- if _input_was_single_img:
505
  all_embeddings = all_embeddings[0]
506
 
507
- self.train(_is_training)
508
  return all_embeddings
509
 
 
 
 
 
 
510
  @torch.inference_mode()
511
- def encode_text(
512
  self,
513
- sentences: Union[str, List[str]],
514
- task: Optional[str] = None,
515
  batch_size: int = 32,
516
  show_progress_bar: Optional[bool] = None,
517
  convert_to_numpy: bool = True,
@@ -519,119 +434,123 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
519
  device: Optional[torch.device] = None,
520
  normalize_embeddings: bool = True,
521
  truncate_dim: Optional[int] = None,
522
- **tokenizer_kwargs,
523
  ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
524
  """
525
- Computes text embeddings
526
-
527
  Args:
528
- sentences(`str` or `List[str]`):
529
- Sentence or sentences to be encoded
530
- task(`str`, *optional*, defaults to `None`):
531
- Specifies the task for which the encoding is intended. If a `task` is
532
- provided, a task-specific instruction is added to the beginning of each
533
- sentence. If `task` is not provided, no instructions are added.
534
  batch_size(`int`, *optional*, defaults to 32):
535
  Batch size for the computation
536
  show_progress_bar(`bool`, *optional*, defaults to None):
537
- Show a progress bar when encoding sentences. If set to None, progress
538
- bar is only shown when `logger.level == logging.INFO` or
539
- `logger.level == logging.DEBUG`
540
  convert_to_numpy(`bool`, *optional*, defaults to True):
541
- If true, the output is a list of numpy vectors. Else, it is a list of
542
- pytorch tensors
543
  convert_to_tensor(`bool`, *optional*, defaults to False):
544
- If true, you get one large tensor as return. Overwrites any setting
545
- from convert_to_numpy
546
  device(`torch.device`, *optional*, defaults to None):
547
  Which torch.device to use for the computation
548
- normalize_embeddings(`bool`, *optional*, defaults to True):
549
  If set to true, returned vectors will have length 1. In that case,
550
  the faster dot-product (util.dot_score) instead of cosine similarity
551
- can be used
552
  truncate_dim(`int`, *optional*, defaults to None):
553
- The dimension to truncate sentence embeddings to. If set to `None`
554
- no truncation is performed
555
- tokenizer_kwargs(`Dict[str, Any]`, *optional*, defaults to {}):
556
- Keyword arguments for the tokenizer
557
  Returns:
558
- By default, a list of tensors is returned. If convert_to_tensor, a stacked
559
- tensor is returned. If convert_to_numpy, a numpy matrix is returned.
 
560
  """
561
- _is_training = self.training
 
562
  self.eval()
563
-
 
564
  all_embeddings = []
565
- self.tokenizer = self.get_tokenizer()
566
-
567
  if show_progress_bar is None:
568
  show_progress_bar = (
569
  logger.getEffectiveLevel() == logging.INFO
570
  or logger.getEffectiveLevel() == logging.DEBUG
571
  )
 
572
  if convert_to_tensor:
573
  convert_to_numpy = False
574
-
575
- _input_was_string = False
576
- if isinstance(sentences, str) or not hasattr(sentences, '__len__'):
577
- sentences = [sentences]
578
- _input_was_string = True
579
-
580
  if device is not None:
581
  self.to(device)
582
-
583
- _permutation = np.argsort([-len(i) for i in sentences])
584
- _inverse_permutation = np.argsort(_permutation)
585
- sentences = [sentences[idx] for idx in _permutation]
586
-
587
- tokenizer_kwargs['padding'] = tokenizer_kwargs.get('padding', True)
588
- tokenizer_kwargs['max_length'] = tokenizer_kwargs.get('max_length', 512)
589
- tokenizer_kwargs['truncation'] = tokenizer_kwargs.get('truncation', True)
590
-
591
  if has_tqdm:
592
  range_iter = trange(
593
  0,
594
- len(sentences),
595
  batch_size,
596
  desc='Encoding',
597
  disable=not show_progress_bar,
598
  )
599
  else:
600
- range_iter = range(0, len(sentences), batch_size)
601
-
602
- truncate_dim = truncate_dim or self.config.truncate_dim
603
 
604
- instruction = self.text_model.get_instruction_from_task(task)
605
- if instruction:
606
- sentences = [instruction + sentence for sentence in sentences]
607
 
 
608
  for i in range_iter:
609
- tokens = self.tokenizer(
610
- sentences[i: i + batch_size],
611
- return_tensors='pt',
612
- **tokenizer_kwargs,
613
- ).to(self.device)
614
- embeddings = self.get_text_features(input_ids=tokens)
 
615
  if truncate_dim:
616
- embeddings = self._truncate_embeddings(embeddings, truncate_dim)
617
  if normalize_embeddings:
618
- embeddings = f.normalize(embeddings, p=2, dim=1)
619
  if convert_to_numpy:
620
  embeddings = embeddings.cpu()
621
  all_embeddings.extend(embeddings)
622
-
623
- all_embeddings = [all_embeddings[idx] for idx in _inverse_permutation]
624
-
625
  if convert_to_tensor:
626
  all_embeddings = torch.stack(all_embeddings)
627
  elif convert_to_numpy:
628
- all_embeddings = np.asarray(
629
- [emb.to(torch.float32).numpy() for emb in all_embeddings]
630
- )
631
- if _input_was_string:
632
  all_embeddings = all_embeddings[0]
633
-
634
- self.train(_is_training)
635
  return all_embeddings
636
 
637
  def forward(
 
5
  # and adjusted for Jina CLIP
6
 
7
  import base64
 
 
8
  from functools import partial
9
  from io import BytesIO
10
  from typing import List, Optional, Tuple, Union
 
14
  import torch
15
  import torch.nn.functional as f
16
  import torch.utils.checkpoint
 
17
  from torch import nn
18
  from transformers import (
19
  AutoImageProcessor,
 
35
 
36
  has_tqdm = True
37
  except ImportError:
 
38
  has_tqdm = False
39
 
40
  from .configuration_clip import JinaCLIPConfig, JinaCLIPTextConfig, JinaCLIPVisionConfig
41
  from .eva_model import EVAVisionTransformer
42
  from .hf_model import HFTextEncoder
43
+
44
+ # needed for HF to correctly import in cache
45
  from .rope_embeddings import VisionRotaryEmbeddingFast # noqa: F401
46
  from .transform import ( # noqa: F401
47
  OPENAI_DATASET_MEAN,
 
68
  return HFTextEncoder(
69
  model_name_or_path=config.hf_model_name_or_path,
70
  output_dim=config.embed_dim,
 
 
71
  pooler_type=config.pooler_type,
72
  proj_type=config.proj_type,
73
  proj_bias=config.proj_bias,
 
115
  )
116
 
117
 
118
  class JinaCLIPPreTrainedModel(PreTrainedModel):
119
  """
120
  An abstract class to handle weights initialization and a simple interface for
 
144
  if isinstance(module, nn.Linear) and module.bias is not None:
145
  module.bias.data.zero_()
146
 
 
 
 
 
 
 
147
 
148
  class JinaCLIPTextModel(JinaCLIPPreTrainedModel):
149
  config_class = JinaCLIPTextConfig
 
216
  f'JinaCLIPVisionConfig but is of type {type(config.vision_config)}.'
217
  )
218
 
 
219
  text_config = config.text_config
220
  vision_config = config.vision_config
221
 
222
+ if config.use_text_flash_attn is not None:
223
+ text_config.hf_model_config_kwargs['use_flash_attn'] = config.use_text_flash_attn
224
+ if config.use_vision_xformers is not None:
225
+ vision_config.x_attention = config.use_vision_xformers
226
+
227
  self.add_projections = config.add_projections
228
  self.projection_dim = config.projection_dim
229
  self.text_embed_dim = text_config.embed_dim
230
  self.vision_embed_dim = vision_config.embed_dim
231
+
232
  self.text_model = _build_text_tower(text_config)
233
  self.vision_model = _build_vision_tower(vision_config)
234
  self.logit_scale = nn.Parameter(
235
  torch.tensor(self.config.logit_scale_init_value)
236
  )
237
+
238
  if self.add_projections:
239
  self.visual_projection = nn.Linear(
240
  self.vision_embed_dim, self.projection_dim, bias=False
 
251
  self.post_init()
252
 
253
  def get_tokenizer(self):
254
+ if not self.tokenizer:
255
  self.tokenizer = AutoTokenizer.from_pretrained(
256
  self.config._name_or_path, trust_remote_code=True
257
  )
 
286
  )
287
  return self.visual_projection(self.vision_model(x=x))
288
 
289
+ def truncate_embeddings(self, embeddings, truncate_dim):
290
  if not self.config.matryoshka_dimensions:
291
  logger.warning(
292
+ "Matryoshka embeddings are not supported, so dimension truncation will not be performed."
293
+ )
294
+ return embeddings
295
+ elif truncate_dim in self.config.matryoshka_dimensions:
296
+ return embeddings[:, :truncate_dim]
297
+ else:
298
+ raise ValueError(
299
+ f"The provided `truncate_dim` value of {truncate_dim} is not supported. "
300
+ f"Supported dimensions are {self.config.matryoshka_dimensions}."
301
  )
302
 
303
  @torch.inference_mode()
304
+ def encode_text(
305
  self,
306
+ sentences: Union[str, List[str]],
307
  batch_size: int = 32,
308
  show_progress_bar: Optional[bool] = None,
309
  convert_to_numpy: bool = True,
 
311
  device: Optional[torch.device] = None,
312
  normalize_embeddings: bool = True,
313
  truncate_dim: Optional[int] = None,
314
+ **tokenizer_kwargs,
315
  ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
316
  """
317
+ Computes sentence embeddings
318
+ Args:
319
+ sentences(`str` or `List[str]`):
320
+ Sentence or sentences to be encoded
321
+ batch_size(`int`, *optional*, defaults to 32):
322
+ Batch size for the computation
323
+ show_progress_bar(`bool`, *optional*, defaults to None):
324
+ Show a progress bar when encoding sentences.
325
+ If set to None, progress bar is only shown when
326
+ `logger.level == logging.INFO` or `logger.level == logging.DEBUG`.
327
+ convert_to_numpy(`bool`, *optional*, defaults to True):
328
+ If true, the output is a list of numpy vectors.
329
+ Else, it is a list of pytorch tensors.
330
+ convert_to_tensor(`bool`, *optional*, defaults to False):
331
+ If true, you get one large tensor as return.
332
+ Overwrites any setting from convert_to_numpy
333
+ device(`torch.device`, *optional*, defaults to None):
334
+ Which torch.device to use for the computation
335
+ normalize_embeddings(`bool`, *optional*, defaults to False):
336
+ If set to true, returned vectors will have length 1. In that case,
337
+ the faster dot-product (util.dot_score) instead of cosine similarity
338
+ can be used.
339
+ truncate_dim(`int`, *optional*, defaults to None):
340
+ The dimension to truncate sentence embeddings to. `None` does no truncation.
341
+ tokenizer_kwargs(`Dict[str, Any]`, *optional*, defaults to {}):
342
+ Keyword arguments for the tokenizer
343
+ Returns:
344
+ By default, a list of tensors is returned.
345
+ If convert_to_tensor, a stacked tensor is returned.
346
+ If convert_to_numpy, a numpy matrix is returned.
347
  """
348
+ is_training = self.training
 
349
  self.eval()
 
 
350
  all_embeddings = []
351
 
352
+ self.tokenizer = self.get_tokenizer()
353
+
354
  if show_progress_bar is None:
355
  show_progress_bar = (
356
  logger.getEffectiveLevel() == logging.INFO
357
  or logger.getEffectiveLevel() == logging.DEBUG
358
  )
359
+
360
  if convert_to_tensor:
361
  convert_to_numpy = False
362
 
363
+ input_was_string = False
364
+ if isinstance(sentences, str) or not hasattr(sentences, '__len__'):
365
+ sentences = [sentences]
366
+ input_was_string = True
367
 
368
  if device is not None:
369
  self.to(device)
370
 
371
+ permutation = np.argsort([-len(i) for i in sentences])
372
+ inverse_permutation = np.argsort(permutation)
373
+ sentences = [sentences[idx] for idx in permutation]
374
+
375
+ tokenizer_kwargs['padding'] = tokenizer_kwargs.get('padding', True)
376
+ tokenizer_kwargs['max_length'] = tokenizer_kwargs.get('max_length', 512)
377
+ tokenizer_kwargs['truncation'] = tokenizer_kwargs.get('truncation', True)
378
 
379
  if has_tqdm:
380
  range_iter = trange(
381
  0,
382
+ len(sentences),
383
  batch_size,
384
  desc='Encoding',
385
  disable=not show_progress_bar,
386
  )
387
  else:
388
+ range_iter = range(0, len(sentences), batch_size)
389
 
390
  truncate_dim = truncate_dim or self.config.truncate_dim
 
391
  for i in range_iter:
392
+ encoded_input = self.tokenizer(
393
+ sentences[i : i + batch_size],
394
+ return_tensors='pt',
395
+ **tokenizer_kwargs,
396
+ ).to(self.device)
397
 
398
+ embeddings = self.get_text_features(input_ids=encoded_input)
 
 
399
 
400
  if truncate_dim:
401
+ embeddings = self.truncate_embeddings(embeddings, truncate_dim)
402
  if normalize_embeddings:
403
+ embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
404
  if convert_to_numpy:
405
  embeddings = embeddings.cpu()
 
406
  all_embeddings.extend(embeddings)
407
 
408
+ all_embeddings = [all_embeddings[idx] for idx in inverse_permutation]
409
 
410
  if convert_to_tensor:
411
  all_embeddings = torch.stack(all_embeddings)
412
  elif convert_to_numpy:
413
+ all_embeddings = np.asarray([emb.to(torch.float32).numpy() for emb in all_embeddings])
 
 
414
 
415
+ if input_was_string:
416
  all_embeddings = all_embeddings[0]
417
 
418
+ self.train(is_training)
419
  return all_embeddings
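For orientation, a minimal usage sketch of the encode_text path added above (not part of this diff). The checkpoint name is an assumption; any checkpoint that bundles this modeling code and is loaded with trust_remote_code should behave the same, and the output width depends on the loaded projection.

# Hypothetical usage sketch, assuming a Jina CLIP checkpoint that ships this code.
from transformers import AutoModel

model = AutoModel.from_pretrained('jinaai/jina-clip-v1', trust_remote_code=True)
text_vectors = model.encode_text(
    ['a photo of a cat', 'a photo of a dog'],
    batch_size=2,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
print(text_vectors.shape)  # (2, embed_dim) for the loaded checkpoint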
420
 
421
+ def decode_data_image(data_image_str):
422
+ header, data = data_image_str.split(',', 1)
423
+ image_data = base64.b64decode(data)
424
+ return Image.open(BytesIO(image_data))
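The helper above splits on the first comma and base64-decodes the payload, so any `data:image/...;base64,` string works. A quick round-trip sketch, assuming the helper resolves as a plain function at the call site (it takes no self argument):

# Build a data URI from a tiny PIL image, then decode it back with the helper.
import base64
from io import BytesIO
from PIL import Image

buf = BytesIO()
Image.new('RGB', (8, 8), color='red').save(buf, format='PNG')
uri = 'data:image/png;base64,' + base64.b64encode(buf.getvalue()).decode()
print(decode_data_image(uri).size)  # (8, 8)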
425
+
426
  @torch.inference_mode()
427
+ def encode_image(
428
  self,
429
+ images: Union[str, List[Union[str, "Image.Image"]]],
 
430
  batch_size: int = 32,
431
  show_progress_bar: Optional[bool] = None,
432
  convert_to_numpy: bool = True,
 
434
  device: Optional[torch.device] = None,
435
  normalize_embeddings: bool = True,
436
  truncate_dim: Optional[int] = None,
 
437
  ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
438
  """
439
+ Computes image embeddings.
440
+
441
  Args:
442
+ images(`str` or `List[Union[str, Image.Image]]`):
443
+ image paths, URLs, PIL images, or data:image/ strings to be encoded
 
 
 
 
444
  batch_size(`int`, *optional*, defaults to 32):
445
  Batch size for the computation
446
  show_progress_bar(`bool`, *optional*, defaults to None):
447
+ Show a progress bar when encoding images.
448
+ If set to None, progress bar is only shown when
449
+ `logger.level == logging.INFO` or `logger.level == logging.DEBUG`.
450
  convert_to_numpy(`bool`, *optional*, defaults to True):
451
+ If true, the output is a list of numpy vectors.
452
+ Else, it is a list of pytorch tensors.
453
  convert_to_tensor(`bool`, *optional*, defaults to False):
454
+ If true, a single stacked tensor is returned.
455
+ Overrides any setting of convert_to_numpy.
456
  device(`torch.device`, *optional*, defaults to None):
457
  Which torch.device to use for the computation
458
+ normalize_embeddings(`bool`, *optional*, defaults to True):
459
  If set to true, returned vectors will have length 1. In that case,
460
  the faster dot-product (util.dot_score) instead of cosine similarity
461
+ can be used.
462
  truncate_dim(`int`, *optional*, defaults to None):
463
+ The dimension to truncate image embeddings to. `None` does no truncation.
 
 
 
464
  Returns:
465
+ By default, a list of tensors is returned.
466
+ If convert_to_tensor, a stacked tensor is returned.
467
+ If convert_to_numpy, a numpy matrix is returned.
468
  """
469
+
470
+ is_training = self.training
471
  self.eval()
472
+
473
+ self.preprocess = self.get_preprocess()
474
  all_embeddings = []
475
+
 
476
  if show_progress_bar is None:
477
  show_progress_bar = (
478
  logger.getEffectiveLevel() == logging.INFO
479
  or logger.getEffectiveLevel() == logging.DEBUG
480
  )
481
+
482
  if convert_to_tensor:
483
  convert_to_numpy = False
484
+
485
+ input_was_single_img = False
486
+ if isinstance(images, str) or not hasattr(images, '__len__'):
487
+ images = [images]
488
+ input_was_single_img = True
489
+
490
  if device is not None:
491
  self.to(device)
492
+
493
+ permutation = np.argsort([-len(str(i)) for i in images])
494
+ inverse_permutation = np.argsort(permutation)
495
+ images = [images[idx] for idx in permutation]
496
+
 
 
 
 
497
  if has_tqdm:
498
  range_iter = trange(
499
  0,
500
+ len(images),
501
  batch_size,
502
  desc='Encoding',
503
  disable=not show_progress_bar,
504
  )
505
  else:
506
+ range_iter = range(0, len(images), batch_size)
 
 
507
 
508
+ from PIL import Image
 
 
509
 
510
+ truncate_dim = truncate_dim or self.config.truncate_dim
511
  for i in range_iter:
512
+ batch_images = images[i:i+batch_size]
513
+ processed_inputs = []
514
+
515
+ for img in batch_images:
516
+ if isinstance(img, str):
517
+ if img.startswith('http'):
518
+ response = requests.get(img)
519
+ image = Image.open(BytesIO(response.content)).convert('RGB')
520
+ elif img.startswith('data:image/'):
521
+ image = decode_data_image(img).convert('RGB')
522
+ else:
523
+ image = Image.open(img).convert('RGB')
524
+ elif isinstance(img, Image.Image):
525
+ image = img.convert('RGB')
526
+ else:
527
+ raise ValueError('Unsupported image input type; expected a path, URL, data URI, or PIL.Image')
528
+
529
+ processed_inputs.append(image)
530
+
531
+ processed_inputs = self.preprocess(processed_inputs)
532
+ processed_inputs = processed_inputs.to(self.device)
533
+ embeddings = self.get_image_features(processed_inputs)
534
+
535
  if truncate_dim:
536
+ embeddings = self.truncate_embeddings(embeddings, truncate_dim)
537
  if normalize_embeddings:
538
+ embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
539
  if convert_to_numpy:
540
  embeddings = embeddings.cpu()
541
  all_embeddings.extend(embeddings)
542
+
543
+ all_embeddings = [all_embeddings[idx] for idx in inverse_permutation]
544
+
545
  if convert_to_tensor:
546
  all_embeddings = torch.stack(all_embeddings)
547
  elif convert_to_numpy:
548
+ all_embeddings = np.asarray([emb.to(torch.float32).numpy() for emb in all_embeddings])
549
+
550
+ if input_was_single_img:
 
551
  all_embeddings = all_embeddings[0]
552
+
553
+ self.train(is_training)
554
  return all_embeddings
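Continuing the encode_text sketch above (same hypothetical model object), the matching encode_image call accepts local paths, URLs, data URIs, and PIL images in one list; the path and URL below are placeholders:

# Placeholder inputs; reuses `model` and `text_vectors` from the earlier sketch.
image_vectors = model.encode_image(
    ['photos/cat.jpg', 'https://example.com/dog.png'],
    convert_to_numpy=True,
    normalize_embeddings=True,
)
# With unit-length vectors, cosine similarity is just a dot product.
print(image_vectors @ text_vectors.T)  # (2, 2) similarity matrix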
555
 
556
  def forward(
processing_clip.py CHANGED
@@ -72,6 +72,7 @@ class JinaCLIPImageProcessor(BaseImageProcessor):
72
  return output
73
 
74
  def preprocess(self, images: ImageInput, **kwargs) -> BatchFeature:
 
75
  _transform_needs_rebuild = False
76
  for k, v in kwargs.items():
77
  if k in self._valid_processor_keys:
 
72
  return output
73
 
74
  def preprocess(self, images: ImageInput, **kwargs) -> BatchFeature:
75
+
76
  _transform_needs_rebuild = False
77
  for k, v in kwargs.items():
78
  if k in self._valid_processor_keys:
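The `_transform_needs_rebuild` flag and the key filter above suggest a per-call override hook: keywords matching known processor settings presumably update the instance and trigger a rebuild of the cached pipeline. A self-contained sketch of that general pattern; the class and attribute names are illustrative, not the actual JinaCLIPImageProcessor internals:

# Illustrative override-and-rebuild pattern; not the real processor code.
class OverrideDemo:
    _valid_processor_keys = {'size', 'interpolation'}

    def __init__(self):
        self.size = 224
        self.interpolation = 'bicubic'
        self._transform = None  # cached pipeline built from the current settings

    def _build(self):
        return f'resize({self.size}, {self.interpolation})'  # stand-in for Compose(...)

    def preprocess(self, images, **kwargs):
        needs_rebuild = False
        for k, v in kwargs.items():
            if k in self._valid_processor_keys and getattr(self, k) != v:
                setattr(self, k, v)   # per-call override of a stored setting
                needs_rebuild = True
        if self._transform is None or needs_rebuild:
            self._transform = self._build()
        return [(self._transform, image) for image in images]

print(OverrideDemo().preprocess(['img'], size=384))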
rope_embeddings.py CHANGED
@@ -3,6 +3,7 @@
3
  # https://github.com/baaivision/EVA/tree/master/EVA-CLIP/rei/eva_clip
4
  # --------------------------------------------------------
5
 
 
6
  from math import pi
7
 
8
  import torch
@@ -74,8 +75,10 @@ class VisionRotaryEmbedding(nn.Module):
74
 
75
  freqs = broadcast((freqs_h[:, None, :], freqs_w[None, :, :]), dim=-1)
76
 
77
- self.register_buffer('freqs_cos', freqs.cos(), persistent=False)
78
- self.register_buffer('freqs_sin', freqs.sin(), persistent=False)
 
 
79
 
80
  def forward(self, t, start_index=0):
81
  rot_dim = self.freqs_cos.shape[-1]
@@ -134,8 +137,10 @@ class VisionRotaryEmbeddingFast(nn.Module):
134
 
135
  self.patch_dropout = patch_dropout
136
 
137
- self.register_buffer('freqs_cos', freqs_cos, persistent=False)
138
- self.register_buffer('freqs_sin', freqs_sin, persistent=False)
 
 
139
 
140
  def forward(self, t, patch_indices_keep=None):
141
  if patch_indices_keep is not None:
 
3
  # https://github.com/baaivision/EVA/tree/master/EVA-CLIP/rei/eva_clip
4
  # --------------------------------------------------------
5
 
6
+ import logging
7
  from math import pi
8
 
9
  import torch
 
75
 
76
  freqs = broadcast((freqs_h[:, None, :], freqs_w[None, :, :]), dim=-1)
77
 
78
+ self.register_buffer('freqs_cos', freqs.cos())
79
+ self.register_buffer('freqs_sin', freqs.sin())
80
+
81
+ logging.info(f'Shape of rope freq: {self.freqs_cos.shape}')
82
 
83
  def forward(self, t, start_index=0):
84
  rot_dim = self.freqs_cos.shape[-1]
 
137
 
138
  self.patch_dropout = patch_dropout
139
 
140
+ self.register_buffer('freqs_cos', freqs_cos)
141
+ self.register_buffer('freqs_sin', freqs_sin)
142
+
143
+ logging.info(f'Shape of rope freq: {self.freqs_cos.shape}')
144
 
145
  def forward(self, t, patch_indices_keep=None):
146
  if patch_indices_keep is not None:
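For context on the `persistent=False` flag the old lines used: non-persistent buffers stay on the module (and follow `.to()` / `.cuda()`) but are excluded from `state_dict()`, so registering the rope tables persistently, as the new lines do, makes them part of saved checkpoints. A minimal illustration:

# Minimal sketch of the persistent flag on register_buffer.
import torch
from torch import nn

class BufferDemo(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer('saved', torch.zeros(2))                       # in state_dict
        self.register_buffer('volatile', torch.zeros(2), persistent=False)  # skipped

print(sorted(BufferDemo().state_dict()))  # ['saved']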
transform.py CHANGED
@@ -1,10 +1,11 @@
 
1
  import random
2
  import warnings
3
  from dataclasses import asdict, dataclass
4
  from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
5
 
6
  import torch
7
- import torchvision.transforms.functional as f
8
  from torchvision.transforms import (
9
  CenterCrop,
10
  ColorJitter,
@@ -22,93 +23,88 @@ OPENAI_DATASET_MEAN = tuple(OPENAI_CLIP_MEAN)
22
  OPENAI_DATASET_STD = tuple(OPENAI_CLIP_STD)
23
 
24
 
25
- def _setup_size(size, error_msg):
26
- if isinstance(size, int):
27
- return size, size
28
- if isinstance(size, Sequence) and len(size) == 1:
29
- return size[0], size[0]
30
- if len(size) != 2:
31
- raise ValueError(error_msg)
32
- return size
 
33
 
 
 
34
 
35
- def _center_crop_or_pad(
36
- img: torch.Tensor,
37
- output_size: Union[int, Tuple[int, ...], List[int]],
38
- fill: Union[int, Tuple[int]] = 0,
39
- ) -> torch.Tensor:
40
- """
41
- Center crops and/or pads the given image. If the image is torch Tensor, it is
42
- expected to have [..., H, W] shape, where ... means an arbitrary number of leading
43
- dimensions. If image size is smaller than output size along any edge, image is
44
- padded with 0 and then center cropped.
45
- """
46
- if isinstance(output_size, int):
47
- output_size = (output_size, output_size)
48
- elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
49
- output_size = (output_size[0], output_size[0])
50
 
51
- _, image_height, image_width = f.get_dimensions(img)
52
- crop_height, crop_width = output_size
 
53
 
54
- if crop_width > image_width or crop_height > image_height:
55
- padding_ltrb = [
56
- (crop_width - image_width) // 2 if crop_width > image_width else 0,
57
- (crop_height - image_height) // 2 if crop_height > image_height else 0,
58
- (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
59
- (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
60
- ]
61
- img = f.pad(img, padding_ltrb, fill=fill)
62
- _, image_height, image_width = f.get_dimensions(img)
63
- if crop_width == image_width and crop_height == image_height:
64
- return img
65
 
66
- crop_top = int(round((image_height - crop_height) / 2.0))
67
- crop_left = int(round((image_width - crop_width) / 2.0))
68
- return f.crop(img, crop_top, crop_left, crop_height, crop_width)
69
 
70
 
71
- class _CenterCropOrPad(torch.nn.Module):
72
- """Crops the given image at the center.
73
- If the image is torch Tensor, it is expected
74
- to have [..., H, W] shape, where ... means an arbitrary number of leading
75
- dimensions. If image size is smaller than output size along any edge, image is
76
- padded with 0 and then center cropped.
77
-
78
- Args:
79
- size (sequence or int): Desired output size of the crop. If size is an
80
- int instead of sequence like (h, w), a square crop (size, size) is
81
- made. If provided a sequence of length 1, it will be interpreted as
82
- (size[0], size[0]).
83
  """
 
 
 
 
 
 
 
 
 
 
84
 
85
- def __init__(self, size, fill=0):
86
- super().__init__()
87
- self.size = _setup_size(
88
- size, error_msg='Please provide only two dimensions (h, w) for size.'
89
- )
90
- self.fill = fill
91
 
92
- def forward(self, img):
93
- """
94
- Args:
95
- img (PIL Image or Tensor): Image to be cropped.
96
 
97
- Returns:
98
- PIL Image or Tensor: Cropped image.
99
- """
100
- return _center_crop_or_pad(img, self.size, fill=self.fill)
101
 
102
- def __repr__(self) -> str:
103
- return f'{self.__class__.__name__}(size={self.size})'
 
 
 
 
 
 
 
 
104
 
 
 
 
105
 
106
- def _convert_to_rgb(image):
107
- return image.convert('RGB')
108
 
 
 
 
109
 
110
- class _ResizeKeepRatio:
111
- """Resize while keeping ratio. Copied from timm"""
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  def __init__(
114
  self,
@@ -163,9 +159,8 @@ class _ResizeKeepRatio:
163
  ratio_factor[0] / aspect_factor,
164
  ratio_factor[1] * aspect_factor,
165
  )
166
- return [
167
- round(x * factor / ratio) for x, factor in zip(source_size, ratio_factor)
168
- ]
169
 
170
  def __call__(self, img):
171
  """
@@ -185,7 +180,7 @@ class _ResizeKeepRatio:
185
  self.random_aspect_prob,
186
  self.random_aspect_range,
187
  )
188
- img = f.resize(img, size, self.interpolation)
189
  return img
190
 
191
  def __repr__(self):
@@ -195,8 +190,92 @@ class _ResizeKeepRatio:
195
  return format_string
196
 
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  class _ColorJitter(object):
199
- """Apply color jitter to the PIL image with a specified probability"""
 
 
200
 
201
  def __init__(self, brightness=0.0, contrast=0.0, saturation=0.0, hue=0.0, p=0.8):
202
  assert 0.0 <= p <= 1.0
@@ -213,7 +292,9 @@ class _ColorJitter(object):
213
 
214
 
215
  class _GrayScale(object):
216
- """Apply gray scale to the PIL image with a specified probability"""
 
 
217
 
218
  def __init__(self, p=0.2):
219
  assert 0.0 <= p <= 1.0
@@ -227,20 +308,6 @@ class _GrayScale(object):
227
  return img
228
 
229
 
230
- @dataclass
231
- class AugmentationCfg:
232
- scale: Tuple[float, float] = (0.9, 1.0)
233
- ratio: Optional[Tuple[float, float]] = None
234
- color_jitter: Optional[
235
- Union[float, Tuple[float, float, float], Tuple[float, float, float, float]]
236
- ] = None
237
- re_prob: Optional[float] = None
238
- re_count: Optional[int] = None
239
- use_timm: bool = False
240
- color_jitter_prob: float = None
241
- gray_scale_prob: float = None
242
-
243
-
244
  def image_transform(
245
  image_size: Union[int, Tuple[int, int]],
246
  is_train: bool,
@@ -340,10 +407,10 @@ def image_transform(
340
  else:
341
  if resize_mode == 'longest':
342
  transforms = [
343
- _ResizeKeepRatio(
344
  image_size, interpolation=interpolation_mode, longest=1
345
  ),
346
- _CenterCropOrPad(image_size, fill=fill_color),
347
  ]
348
  elif resize_mode == 'squash':
349
  if isinstance(image_size, int):
@@ -361,7 +428,7 @@ def image_transform(
361
  transforms = [Resize(image_size[0], interpolation=interpolation_mode)]
362
  else:
363
  # resize shortest edge to matching target dim for non-square target
364
- transforms = [_ResizeKeepRatio(image_size)]
365
  transforms += [CenterCrop(image_size)]
366
 
367
  transforms.extend(
@@ -372,3 +439,20 @@ def image_transform(
372
  ]
373
  )
374
  return Compose(transforms)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numbers
2
  import random
3
  import warnings
4
  from dataclasses import asdict, dataclass
5
  from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
6
 
7
  import torch
8
+ import torchvision.transforms.functional as F
9
  from torchvision.transforms import (
10
  CenterCrop,
11
  ColorJitter,
 
23
  OPENAI_DATASET_STD = tuple(OPENAI_CLIP_STD)
24
 
25
 
26
+ @dataclass
27
+ class PreprocessCfg:
28
+ size: Union[int, Tuple[int, int]] = 224
29
+ mode: str = 'RGB'
30
+ mean: Tuple[float, ...] = OPENAI_DATASET_MEAN
31
+ std: Tuple[float, ...] = OPENAI_DATASET_STD
32
+ interpolation: str = 'bicubic'
33
+ resize_mode: str = 'shortest'
34
+ fill_color: int = 0
35
 
36
+ def __post_init__(self):
37
+ assert self.mode in ('RGB',)
38
 
39
+ @property
40
+ def num_channels(self):
41
+ return 3
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ @property
44
+ def input_size(self):
45
+ return (self.num_channels,) + (self.size, self.size)
46
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ _PREPROCESS_KEYS = set(asdict(PreprocessCfg()).keys())
 
 
49
 
50
 
51
+ def merge_preprocess_dict(
52
+ base: Union[PreprocessCfg, Dict],
53
+ overlay: Dict,
54
+ ):
55
+ """Merge overlay key-value pairs on top of base preprocess cfg or dict.
56
+ Input dicts are filtered based on PreprocessCfg fields.
 
 
 
 
 
 
57
  """
58
+ if isinstance(base, PreprocessCfg):
59
+ base_clean = asdict(base)
60
+ else:
61
+ base_clean = {k: v for k, v in base.items() if k in _PREPROCESS_KEYS}
62
+ if overlay:
63
+ overlay_clean = {
64
+ k: v for k, v in overlay.items() if k in _PREPROCESS_KEYS and v is not None
65
+ }
66
+ base_clean.update(overlay_clean)
67
+ return base_clean
68
 
 
 
 
 
 
 
69
 
70
+ def merge_preprocess_kwargs(base: Union[PreprocessCfg, Dict], **kwargs):
71
+ return merge_preprocess_dict(base, kwargs)
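A quick behavioural sketch of the two merge helpers added above: unknown keys and `None` overrides are dropped, and known keys win over the base config. Values are illustrative.

# Assumes PreprocessCfg and merge_preprocess_kwargs from this module are in scope.
base = PreprocessCfg(size=224, interpolation='bicubic')
merged = merge_preprocess_kwargs(base, size=384, interpolation=None, unknown_key=1)
print(merged['size'], merged['interpolation'])  # 384 bicubic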
 
 
72
 
 
 
 
 
73
 
74
+ @dataclass
75
+ class AugmentationCfg:
76
+ scale: Tuple[float, float] = (0.9, 1.0)
77
+ ratio: Optional[Tuple[float, float]] = None
78
+ color_jitter: Optional[
79
+ Union[float, Tuple[float, float, float], Tuple[float, float, float, float]]
80
+ ] = None
81
+ re_prob: Optional[float] = None
82
+ re_count: Optional[int] = None
83
+ use_timm: bool = False
84
 
85
+ # params for simclr_jitter_gray
86
+ color_jitter_prob: float = None
87
+ gray_scale_prob: float = None
88
 
 
 
89
 
90
+ def _setup_size(size, error_msg):
91
+ if isinstance(size, numbers.Number):
92
+ return int(size), int(size)
93
 
94
+ if isinstance(size, Sequence) and len(size) == 1:
95
+ return size[0], size[0]
96
+
97
+ if len(size) != 2:
98
+ raise ValueError(error_msg)
99
+
100
+ return size
101
+
102
+
103
+ class ResizeKeepRatio:
104
+ """Resize and Keep Ratio
105
+
106
+ Copied from `timm`.
107
+ """
108
 
109
  def __init__(
110
  self,
 
159
  ratio_factor[0] / aspect_factor,
160
  ratio_factor[1] * aspect_factor,
161
  )
162
+ size = [round(x * f / ratio) for x, f in zip(source_size, ratio_factor)]
163
+ return size
 
164
 
165
  def __call__(self, img):
166
  """
 
180
  self.random_aspect_prob,
181
  self.random_aspect_range,
182
  )
183
+ img = F.resize(img, size, self.interpolation)
184
  return img
185
 
186
  def __repr__(self):
 
190
  return format_string
191
 
192
 
193
+ def center_crop_or_pad(
194
+ img: torch.Tensor, output_size: List[int], fill=0
195
+ ) -> torch.Tensor:
196
+ """Center crops and/or pads the given image.
197
+ If the image is torch Tensor, it is expected
198
+ to have [..., H, W] shape, where ... means an arbitrary number of leading
199
+ dimensions. If image size is smaller than output size along any edge, image is
200
+ padded with 0 and then center cropped.
201
+
202
+ Args:
203
+ img (PIL Image or Tensor): Image to be cropped.
204
+ output_size (sequence or int): (height, width) of the crop box. If int or
205
+ sequence with single int, it is used for both directions.
206
+ fill (int, Tuple[int]): Padding color
207
+
208
+ Returns:
209
+ PIL Image or Tensor: Cropped image.
210
+ """
211
+ if isinstance(output_size, numbers.Number):
212
+ output_size = (int(output_size), int(output_size))
213
+ elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
214
+ output_size = (output_size[0], output_size[0])
215
+
216
+ _, image_height, image_width = F.get_dimensions(img)
217
+ crop_height, crop_width = output_size
218
+
219
+ if crop_width > image_width or crop_height > image_height:
220
+ padding_ltrb = [
221
+ (crop_width - image_width) // 2 if crop_width > image_width else 0,
222
+ (crop_height - image_height) // 2 if crop_height > image_height else 0,
223
+ (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
224
+ (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
225
+ ]
226
+ img = F.pad(img, padding_ltrb, fill=fill)
227
+ _, image_height, image_width = F.get_dimensions(img)
228
+ if crop_width == image_width and crop_height == image_height:
229
+ return img
230
+
231
+ crop_top = int(round((image_height - crop_height) / 2.0))
232
+ crop_left = int(round((image_width - crop_width) / 2.0))
233
+ return F.crop(img, crop_top, crop_left, crop_height, crop_width)
234
+
235
+
236
+ class CenterCropOrPad(torch.nn.Module):
237
+ """Crops the given image at the center.
238
+ If the image is torch Tensor, it is expected
239
+ to have [..., H, W] shape, where ... means an arbitrary number of leading
240
+ dimensions. If image size is smaller than output size along any edge, image is
241
+ padded with 0 and then center cropped.
242
+
243
+ Args:
244
+ size (sequence or int): Desired output size of the crop. If size is an
245
+ int instead of sequence like (h, w), a square crop (size, size) is
246
+ made. If provided a sequence of length 1, it will be interpreted as
247
+ (size[0], size[0]).
248
+ """
249
+
250
+ def __init__(self, size, fill=0):
251
+ super().__init__()
252
+ self.size = _setup_size(
253
+ size, error_msg='Please provide only two dimensions (h, w) for size.'
254
+ )
255
+ self.fill = fill
256
+
257
+ def forward(self, img):
258
+ """
259
+ Args:
260
+ img (PIL Image or Tensor): Image to be cropped.
261
+
262
+ Returns:
263
+ PIL Image or Tensor: Cropped image.
264
+ """
265
+ return center_crop_or_pad(img, self.size, fill=self.fill)
266
+
267
+ def __repr__(self) -> str:
268
+ return f'{self.__class__.__name__}(size={self.size})'
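A small sketch of the pad-then-crop behaviour on a tensor whose spatial size is below the target; the corner value shows the zero padding.

# A 3x20x20 tensor is zero-padded up to 32x32, then center cropped (a no-op here).
import torch

small = torch.ones(3, 20, 20)
out = CenterCropOrPad(32)(small)
print(out.shape)           # torch.Size([3, 32, 32])
print(out[:, 0, 0].sum())  # tensor(0.) -> top-left corner comes from the padding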
269
+
270
+
271
+ def _convert_to_rgb(image):
272
+ return image.convert('RGB')
273
+
274
+
275
  class _ColorJitter(object):
276
+ """
277
+ Apply Color Jitter to the PIL image with a specified probability.
278
+ """
279
 
280
  def __init__(self, brightness=0.0, contrast=0.0, saturation=0.0, hue=0.0, p=0.8):
281
  assert 0.0 <= p <= 1.0
 
292
 
293
 
294
  class _GrayScale(object):
295
+ """
296
+ Apply Gray Scale to the PIL image with a specified probability.
297
+ """
298
 
299
  def __init__(self, p=0.2):
300
  assert 0.0 <= p <= 1.0
 
308
  return img
309
 
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  def image_transform(
312
  image_size: Union[int, Tuple[int, int]],
313
  is_train: bool,
 
407
  else:
408
  if resize_mode == 'longest':
409
  transforms = [
410
+ ResizeKeepRatio(
411
  image_size, interpolation=interpolation_mode, longest=1
412
  ),
413
+ CenterCropOrPad(image_size, fill=fill_color),
414
  ]
415
  elif resize_mode == 'squash':
416
  if isinstance(image_size, int):
 
428
  transforms = [Resize(image_size[0], interpolation=interpolation_mode)]
429
  else:
430
  # resize shortest edge to matching target dim for non-square target
431
+ transforms = [ResizeKeepRatio(image_size)]
432
  transforms += [CenterCrop(image_size)]
433
 
434
  transforms.extend(
 
439
  ]
440
  )
441
  return Compose(transforms)
442
+
443
+
444
+ def image_transform_v2(
445
+ cfg: PreprocessCfg,
446
+ is_train: bool,
447
+ aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
448
+ ):
449
+ return image_transform(
450
+ image_size=cfg.size,
451
+ is_train=is_train,
452
+ mean=cfg.mean,
453
+ std=cfg.std,
454
+ interpolation=cfg.interpolation,
455
+ resize_mode=cfg.resize_mode,
456
+ fill_color=cfg.fill_color,
457
+ aug_cfg=aug_cfg,
458
+ )
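Finally, an end-to-end sketch of the new config-driven entry point; the input image is a placeholder and the remaining settings come from the `PreprocessCfg` defaults.

# Build an eval-time pipeline from PreprocessCfg and run a dummy PIL image through it.
from PIL import Image

cfg = PreprocessCfg(size=224, resize_mode='shortest')
eval_transform = image_transform_v2(cfg, is_train=False)
pixels = eval_transform(Image.new('RGB', (320, 240)))
print(pixels.shape)  # expected: torch.Size([3, 224, 224])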