Gengzigang committed · Commit 9dd61e3 · Parent(s): 0a00000

update HF model

Browse files:
- .gitattributes +1 -0
- README.md +60 -2
- config.json +6 -158
- configuration_clip.py +79 -78
- pytorch_model.bin → model.safetensors +2 -2
- modeling_clip.py +109 -60
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,5 +1,9 @@
 ---
 license: apache-2.0
+tags:
+- CLIP
+- LLM2CLIP
+pipeline_tag: zero-shot-classification
 ---
 <div align="center">
 
@@ -29,6 +33,7 @@ In this paper, we propose LLM2CLIP, a novel approach that embraces the power of
 ## Usage
 
 ### Huggingface Version
+Image Embeddings
 ```python
 from PIL import Image
 from transformers import AutoModel
@@ -37,9 +42,8 @@ import torch
 
 image_path = "CLIP.png"
 model_name_or_path = "LLM2CLIP-Openai-B-16" # or /path/to/local/LLM2CLIP-Openai-B-16
-image_size = 224
 
-processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-
+processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch16")
 model = AutoModel.from_pretrained(
     model_name_or_path,
     torch_dtype=torch.float16,
@@ -51,5 +55,59 @@ input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
 with torch.no_grad(), torch.cuda.amp.autocast():
     outputs = model.get_image_features(input_pixels)
 ```
+Retrieval
+```python
+from PIL import Image
+from transformers import AutoModel, AutoConfig, AutoTokenizer
+from transformers import CLIPImageProcessor
+import torch
+from llm2vec import LLM2Vec
+
+processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch16")
+model_name_or_path = "microsoft/LLM2CLIP-Openai-B-16" # or /path/to/local/LLM2CLIP-Openai-B-16
+model = AutoModel.from_pretrained(
+    model_name_or_path,
+    torch_dtype=torch.float16,
+    trust_remote_code=True).to('cuda').eval()
+
+llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
+config = AutoConfig.from_pretrained(
+    llm_model_name, trust_remote_code=True
+)
+llm_model = AutoModel.from_pretrained(llm_model_name, config=config, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
+llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct'  # Workaround for LLM2VEC
+l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
+
+captions = ["a diagram", "a dog", "a cat"]
+image_path = "CLIP.png"
+
+image = Image.open(image_path)
+input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
+
+with torch.no_grad(), torch.cuda.amp.autocast():
+    image_features = model.get_image_features(input_pixels)
+    text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
+    text_features = model.get_text_features(text_features)
+
+    image_features /= image_features.norm(dim=-1, keepdim=True)
+    text_features /= text_features.norm(dim=-1, keepdim=True)
+
+    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+
+print("Label probs:", text_probs)
+
+```
 
 ## BibTeX & Citation
+
+```
+@misc{huang2024llm2clippowerfullanguagemodel,
+      title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
+      author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
+      year={2024},
+      eprint={2411.04997},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2411.04997},
+}
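Note on the retrieval example: it stops at printing the full probability row. Picking the best-matching caption is a one-line follow-up; a minimal sketch, reusing the `captions` and `text_probs` variables from the block above:

```python
# text_probs has shape (num_images, num_captions); row 0 corresponds to CLIP.png.
best = text_probs[0].argmax().item()
print("Predicted caption:", captions[best])  # expected to be "a diagram" for the CLIP figure
```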
config.json CHANGED
@@ -1,179 +1,27 @@
 {
-  "_commit_hash": null,
-  "_name_or_path": "LLM2CLIP-Openai-L-14",
   "architectures": [
-    "
+    "LLM2CLIPModel"
   ],
   "auto_map": {
     "AutoConfig": "configuration_clip.CLIPConfig",
-    "AutoModel": "modeling_clip.
+    "AutoModel": "modeling_clip.LLM2CLIPModel"
   },
   "initializer_factor": 1.0,
   "logit_scale_init_value": 2.6592,
   "model_type": "clip",
   "projection_dim": 1280,
   "text_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
     "bos_token_id": 0,
-    "
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
+    "dropout": 0.0,
     "eos_token_id": 2,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "gelu",
-    "hidden_size": 512,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 2048,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "k_bias": true,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "max_position_embeddings": 77,
-    "min_length": 0,
     "model_type": "clip_text_model",
-    "
-    "num_attention_heads": 8,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_hidden_layers": 12,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": 1,
-    "post_layernorm": false,
-    "prefix": null,
-    "problem_type": null,
-    "projection_dim": 512,
-    "pruned_heads": {},
-    "q_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.44.2",
-    "typical_p": 1.0,
-    "use_bfloat16": false,
-    "v_bias": true,
-    "vocab_size": 49408
+    "projection_dim": 1280
   },
   "torch_dtype": "float32",
-  "transformers_version":
+  "transformers_version": "4.40.2",
   "vision_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
     "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "gelu",
-    "hidden_size": 768,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "image_size": 224,
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 3072,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "k_bias": true,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
     "model_type": "clip_vision_model",
-    "
-    "num_attention_heads": 12,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_channels": 3,
-    "num_hidden_layers": 12,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "patch_size": 16,
+    "patch_size": 16
-    "post_layernorm": false,
-    "prefix": null,
-    "problem_type": null,
-    "projection_dim": 512,
-    "pruned_heads": {},
-    "q_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.44.2",
-    "typical_p": 1.0,
-    "use_bfloat16": false,
-    "v_bias": true
   }
 }
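The trimmed config.json keeps only the fields that differ from the defaults now hard-coded in configuration_clip.py; every deleted key is reconstructed at load time. A minimal sketch of checking that, assuming Hub access and the microsoft/LLM2CLIP-Openai-B-16 repo id used in the README:

```python
from transformers import AutoConfig

# trust_remote_code pulls configuration_clip.py via the auto_map entry above.
config = AutoConfig.from_pretrained("microsoft/LLM2CLIP-Openai-B-16", trust_remote_code=True)
print(type(config).__name__)            # CLIPConfig, the custom class from configuration_clip.py
print(config.projection_dim)            # 1280, pinned in config.json
print(config.vision_config.patch_size)  # 16, pinned in config.json
print(config.text_config.hidden_size)   # 512, filled back in from the class defaults
```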
configuration_clip.py CHANGED
@@ -26,9 +26,9 @@ if TYPE_CHECKING:
     from transformers.utils import TensorType
 
 from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxConfig
 from transformers.utils import logging
 
-
 logger = logging.get_logger(__name__)
 
 
@@ -50,25 +50,33 @@ class CLIPTextConfig(PretrainedConfig):
             Dimensionality of the encoder layers and the pooler layer.
         intermediate_size (`int`, *optional*, defaults to 2048):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
         num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer encoder.
-        max_position_embeddings (`int`, *optional*, defaults to 77)
+        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (`float`, *optional*, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 49406):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 49407):
+            End of stream token id.
 
     Example:
 
@@ -84,7 +92,9 @@ class CLIPTextConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "clip_text_model"
+    base_config_key = "text_config"
 
     def __init__(
         self,
@@ -95,18 +105,16 @@ class CLIPTextConfig(PretrainedConfig):
         num_hidden_layers=12,
         num_attention_heads=8,
         max_position_embeddings=77,
-        hidden_act="
+        hidden_act="quick_gelu",
         layer_norm_eps=1e-5,
         attention_dropout=0.0,
         initializer_range=0.02,
         initializer_factor=1.0,
-        q_bias=True,
-        k_bias=True,
-        v_bias=True,
-        post_layernorm=False,
+        # This differs from `CLIPTokenizer`'s default and from openai/clip
+        # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
         pad_token_id=1,
-        bos_token_id=
-        eos_token_id=
+        bos_token_id=49406,
+        eos_token_id=49407,
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -122,28 +130,8 @@ class CLIPTextConfig(PretrainedConfig):
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.initializer_factor = initializer_factor
-        self.q_bias=q_bias
-        self.k_bias=k_bias
-        self.v_bias=v_bias
-        self.post_layernorm = post_layernorm
         self.attention_dropout = attention_dropout
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
-        # get the text config dict if we are loading from CLIPConfig
-        if config_dict.get("model_type") == "clip":
-            config_dict = config_dict["text_config"]
-
-        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
-            logger.warning(
-                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
-                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
-            )
-
-        return cls.from_dict(config_dict, **kwargs)
-
 
 class CLIPVisionConfig(PretrainedConfig):
     r"""
@@ -160,24 +148,28 @@ class CLIPVisionConfig(PretrainedConfig):
            Dimensionality of the encoder layers and the pooler layer.
         intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
         num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
         image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
         patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"`
-        layer_norm_eps (`float`, *optional*, defaults to 1e-
+            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (`float`, *optional*, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
 
@@ -197,6 +189,7 @@ class CLIPVisionConfig(PretrainedConfig):
     ```"""
 
     model_type = "clip_vision_model"
+    base_config_key = "vision_config"
 
     def __init__(
         self,
@@ -208,15 +201,11 @@ class CLIPVisionConfig(PretrainedConfig):
         num_channels=3,
         image_size=224,
         patch_size=32,
-        hidden_act="
+        hidden_act="quick_gelu",
         layer_norm_eps=1e-5,
         attention_dropout=0.0,
         initializer_range=0.02,
         initializer_factor=1.0,
-        q_bias=True,
-        k_bias=True,
-        v_bias=True,
-        post_layernorm=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -231,30 +220,10 @@ class CLIPVisionConfig(PretrainedConfig):
         self.image_size = image_size
         self.initializer_range = initializer_range
         self.initializer_factor = initializer_factor
-        self.q_bias=q_bias
-        self.k_bias=k_bias
-        self.v_bias=v_bias
-        self.post_layernorm = post_layernorm
         self.attention_dropout = attention_dropout
         self.layer_norm_eps = layer_norm_eps
         self.hidden_act = hidden_act
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-
-        # get the vision config dict if we are loading from CLIPConfig
-        if config_dict.get("model_type") == "clip":
-            config_dict = config_dict["vision_config"]
-
-        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
-            logger.warning(
-                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
-                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
-            )
-
-        return cls.from_dict(config_dict, **kwargs)
-
 
 class CLIPConfig(PretrainedConfig):
     r"""
@@ -272,9 +241,9 @@ class CLIPConfig(PretrainedConfig):
         vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
         projection_dim (`int`, *optional*, defaults to 512):
-
+            Dimensionality of text and vision projection layers.
         logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
-            The
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
         kwargs (*optional*):
            Dictionary of keyword arguments.
 
@@ -303,7 +272,7 @@ class CLIPConfig(PretrainedConfig):
     ```"""
 
     model_type = "clip"
-
+    sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig}
 
     def __init__(
         self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
@@ -339,9 +308,9 @@ class CLIPConfig(PretrainedConfig):
             else:
                 message = (
                     f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
-                    f'value `text_config["{key}"]` will be
+                    f'value `text_config["{key}"]` will be overridden.'
                 )
-                logger.
+                logger.info(message)
 
         # Update all values in `text_config` with the ones in `_text_config_dict`.
         text_config.update(_text_config_dict)
@@ -371,9 +340,9 @@ class CLIPConfig(PretrainedConfig):
             else:
                 message = (
                     f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
-                    f'The value `vision_config["{key}"]` will be
+                    f'The value `vision_config["{key}"]` will be overridden.'
                 )
-                logger.
+                logger.info(message)
 
         # Update all values in `vision_config` with the ones in `_vision_config_dict`.
         vision_config.update(_vision_config_dict)
@@ -405,16 +374,48 @@ class CLIPConfig(PretrainedConfig):
 
         return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
 
-    def to_dict(self):
-        """
-        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
-
-        Returns:
-            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
-        """
-        output = copy.deepcopy(self.__dict__)
-        output["text_config"] = self.text_config.to_dict()
-        output["vision_config"] = self.vision_config.to_dict()
-        output["model_type"] = self.__class__.model_type
-        return output
 
+class CLIPOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("input_ids", {0: "batch", 1: "sequence"}),
+                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+                ("attention_mask", {0: "batch", 1: "sequence"}),
+            ]
+        )
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("logits_per_image", {0: "batch"}),
+                ("logits_per_text", {0: "batch"}),
+                ("text_embeds", {0: "batch"}),
+                ("image_embeds", {0: "batch"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-4
+
+    def generate_dummy_inputs(
+        self,
+        processor: "ProcessorMixin",
+        batch_size: int = -1,
+        seq_length: int = -1,
+        framework: Optional["TensorType"] = None,
+    ) -> Mapping[str, Any]:
+        text_input_dict = super().generate_dummy_inputs(
+            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
+        )
+        image_input_dict = super().generate_dummy_inputs(
+            processor.image_processor, batch_size=batch_size, framework=framework
+        )
+        return {**text_input_dict, **image_input_dict}
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 14
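The added CLIPOnnxConfig mirrors the ONNX export description that ships with the stock CLIP implementation in transformers. A quick way to see what it declares is to instantiate it directly; this is a sketch that assumes a local copy of configuration_clip.py is importable and that transformers still ships the transformers.onnx module:

```python
from configuration_clip import CLIPConfig, CLIPOnnxConfig  # hypothetical local import of the file above

onnx_config = CLIPOnnxConfig(CLIPConfig())
print(onnx_config.inputs)               # input_ids, pixel_values, attention_mask with dynamic batch/sequence axes
print(onnx_config.outputs)              # logits_per_image, logits_per_text, text_embeds, image_embeds
print(onnx_config.default_onnx_opset)   # 14
print(onnx_config.atol_for_validation)  # 1e-4
```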
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:0fd872fd6bf16bfba5624e8f13c14168f5b25496fc25246c04556bc858dd9a6d
+size 1442236212
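The weights now ship as a single safetensors file instead of pytorch_model.bin. A minimal sketch for inspecting the downloaded checkpoint locally (assumes `safetensors` is installed and model.safetensors is in the working directory):

```python
from safetensors.torch import load_file

state_dict = load_file("model.safetensors")
print(len(state_dict), "tensors")
print(sum(t.numel() for t in state_dict.values()), "parameters")
# Spot-check that the new text adapter weights are present (key prefix taken from modeling_clip.py below).
print([k for k in state_dict if "text_adapter" in k][:5])
```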
modeling_clip.py CHANGED
@@ -37,9 +37,9 @@ from transformers.utils import (
     logging,
     replace_return_docstrings,
 )
+# from configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 
-
 if is_flash_attn_2_available():
     from transformers.modeling_flash_attention_utils import _flash_attention_forward
 
@@ -603,16 +603,15 @@ class CLIPPreTrainedModel(PreTrainedModel):
             fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
             nn.init.normal_(module.fc1.weight, std=fc_std)
             nn.init.normal_(module.fc2.weight, std=in_proj_std)
-        elif isinstance(module,
-            pass
+        elif isinstance(module, LLM2CLIPModel):
             # nn.init.normal_(
             #     module.text_projection.weight,
             #     std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
             # )
+            nn.init.normal_(
+                module.visual_projection.weight,
+                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
+            )
         elif isinstance(module, CLIPVisionModelWithProjection):
             nn.init.normal_(
                 module.visual_projection.weight,
@@ -1112,80 +1111,97 @@ class CLIPVisionModel(CLIPPreTrainedModel):
 
 
 @add_start_docstrings(CLIP_START_DOCSTRING)
-class
+class LLM2CLIPModel(CLIPPreTrainedModel):
     config_class = CLIPConfig
     _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]
 
     def __init__(self, config: CLIPConfig):
         super().__init__(config)
+        # if not isinstance(config.text_config, CLIPTextConfig):
+        #     raise TypeError(
+        #         "config.text_config is expected to be of type CLIPTextConfig but is of type"
+        #         f" {type(config.text_config)}."
+        #     )
+
         if not isinstance(config.vision_config, CLIPVisionConfig):
             raise TypeError(
                 "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
                 f" {type(config.vision_config)}."
             )
 
+        # text_config = config.text_config
         vision_config = config.vision_config
 
         self.projection_dim = config.projection_dim
+        # self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size
+
+        adapter = LLM2CLIP_Adapter()
+        self.text_adapter = adapter
+
+        # text_model = CLIPTextModel._from_config(text_config, attn_implementation=config._attn_implementation)
+        # self.text_model = text_model.text_model
 
         vision_model = CLIPVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation)
         self.vision_model = vision_model.vision_model
 
-        self.visual_projection = nn.Parameter(scale * torch.randn(self.vision_embed_dim, self.projection_dim))
+        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+        # self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
         self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
-        self
+
+    def get_text_features(self, inputs):
+        #TODO: make this more flexible and configurable
+        return self.text_adapter(inputs)
+
+    # @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
+    # def get_text_features(
+    #     self,
+    #     input_ids: Optional[torch.Tensor] = None,
+    #     attention_mask: Optional[torch.Tensor] = None,
+    #     position_ids: Optional[torch.Tensor] = None,
+    #     output_attentions: Optional[bool] = None,
+    #     output_hidden_states: Optional[bool] = None,
+    #     return_dict: Optional[bool] = None,
+    # ) -> torch.FloatTensor:
+    #     r"""
+    #     Returns:
+    #         text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
+    #         applying the projection layer to the pooled output of [`CLIPTextModel`].
+
+    #     Examples:
+
+    #     ```python
+    #     >>> from transformers import AutoTokenizer, CLIPModel
+
+    #     >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+    #     >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+    #     >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+    #     >>> text_features = model.get_text_features(**inputs)
+    #     ```"""
+    #     # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+    #     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    #     output_hidden_states = (
+    #         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    #     )
+    #     return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    #     text_outputs = self.text_model(
+    #         input_ids=input_ids,
+    #         attention_mask=attention_mask,
+    #         position_ids=position_ids,
+    #         output_attentions=output_attentions,
+    #         output_hidden_states=output_hidden_states,
+    #         return_dict=return_dict,
+    #     )
+
+    #     pooled_output = text_outputs[1]
+    #     text_features = self.text_projection(pooled_output)
+
+    #     return text_features
 
     @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
     def get_image_features(
@@ -1232,7 +1248,7 @@ class CLIPModel(CLIPPreTrainedModel):
         )
 
         pooled_output = vision_outputs[1] # pooled_output
-        image_features =
+        image_features = self.visual_projection(pooled_output)
 
         return image_features
 
@@ -1413,7 +1429,40 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
             attentions=text_outputs.attentions,
         )
 
-
+class LinearBlock(nn.Module):
+    def __init__(self, dim, expansion_factor=4, dropout=0., norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.fn = nn.Sequential(
+            nn.Linear(dim, int(expansion_factor * dim)),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(int(expansion_factor * dim), dim),
+        )
+        self.ln = norm_layer(dim)
+
+    def forward(self, x):
+        return x + self.fn(self.ln(x))
+
+class LLM2CLIP_Adapter(nn.Module):
+    def __init__(self):
+        super().__init__()
+        # TODO: make this more flexible and configurable
+        # hard-coded values from the LLM2CLIP model
+        text_embedding_dim = 4096
+        expansion_factor = 2
+        adaptor_num_layers = 4
+        proj_bias = True
+        output_dim = 1280
+        self.adaptor = nn.Sequential(
+            *[LinearBlock(text_embedding_dim, expansion_factor) for _ in range(adaptor_num_layers)],
+            nn.LayerNorm(text_embedding_dim),
+            nn.Linear(text_embedding_dim, output_dim, bias=proj_bias),
+        )
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = torch.nn.functional.normalize(hidden_states, p=2, dim=1)
+        hidden_states = self.adaptor(hidden_states)
+        return hidden_states
+
 @add_start_docstrings(
     """
     CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
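The net effect of the modeling changes is that get_text_features no longer runs a CLIP text tower at all: it takes pre-computed LLM sentence embeddings (4096-d for the Llama-3-8B encoder used via LLM2Vec) and pushes them through the new residual-MLP adapter into the same 1280-d space that get_image_features projects into. A shape-level sketch, assuming GPU and Hub access as in the README:

```python
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "microsoft/LLM2CLIP-Openai-B-16", torch_dtype=torch.float16, trust_remote_code=True
).to("cuda").eval()

# Stand-in for l2v.encode(captions): three 4096-d LLM sentence embeddings.
dummy_llm_embeddings = torch.randn(3, 4096, dtype=torch.float16, device="cuda")
with torch.no_grad():
    text_features = model.get_text_features(dummy_llm_embeddings)
print(text_features.shape)  # torch.Size([3, 1280]), matching projection_dim and get_image_features
```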