Gengzigang committed
Commit 9dd61e3
1 Parent(s): 0a00000

update HF model

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,5 +1,9 @@
1
  ---
2
  license: apache-2.0
3
  ---
4
  <div align="center">
5
 
@@ -29,6 +33,7 @@ In this paper, we propose LLM2CLIP, a novel approach that embraces the power of
29
  ## Usage
30
 
31
  ### Huggingface Version
 
32
  ```python
33
  from PIL import Image
34
  from transformers import AutoModel
@@ -37,9 +42,8 @@ import torch
37
 
38
  image_path = "CLIP.png"
39
  model_name_or_path = "LLM2CLIP-Openai-B-16" # or /path/to/local/LLM2CLIP-Openai-B-16
40
- image_size = 224
41
 
42
- processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch-16")
43
  model = AutoModel.from_pretrained(
44
  model_name_or_path,
45
  torch_dtype=torch.float16,
@@ -51,5 +55,59 @@ input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cud
51
  with torch.no_grad(), torch.cuda.amp.autocast():
52
  outputs = model.get_image_features(input_pixels)
53
  ```
54
 
55
  ## BibTeX & Citation
1
  ---
2
  license: apache-2.0
3
+ tags:
4
+ - CLIP
5
+ - LLM2CLIP
6
+ pipeline_tag: zero-shot-classification
7
  ---
8
  <div align="center">
9
 
 
33
  ## Usage
34
 
35
  ### Huggingface Version
36
+ Image Embeddings
37
  ```python
38
  from PIL import Image
39
  from transformers import AutoModel
 
42
 
43
  image_path = "CLIP.png"
44
  model_name_or_path = "LLM2CLIP-Openai-B-16" # or /path/to/local/LLM2CLIP-Openai-B-16
 
45
 
46
+ processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch16")
47
  model = AutoModel.from_pretrained(
48
  model_name_or_path,
49
  torch_dtype=torch.float16,
 
55
  with torch.no_grad(), torch.cuda.amp.autocast():
56
  outputs = model.get_image_features(input_pixels)
57
  ```
58
+ Retrieval
59
+ ```python
60
+ from PIL import Image
61
+ from transformers import AutoModel, AutoConfig, AutoTokenizer
62
+ from transformers import CLIPImageProcessor
63
+ import torch
64
+ from llm2vec import LLM2Vec
65
+
66
+ processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch16")
67
+ model_name_or_path = "microsoft/LLM2CLIP-Openai-B-16" # or /path/to/local/LLM2CLIP-Openai-B-16
68
+ model = AutoModel.from_pretrained(
69
+ model_name_or_path,
70
+ torch_dtype=torch.float16,
71
+ trust_remote_code=True).to('cuda').eval()
72
+
73
+ llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
74
+ config = AutoConfig.from_pretrained(
75
+ llm_model_name, trust_remote_code=True
76
+ )
77
+ llm_model = AutoModel.from_pretrained(llm_model_name, config=config, trust_remote_code=True)
78
+ tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
79
+ llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
80
+ l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
81
+
82
+ captions = ["a diagram", "a dog", "a cat"]
83
+ image_path = "CLIP.png"
84
+
85
+ image = Image.open(image_path)
86
+ input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
87
+
88
+ with torch.no_grad(), torch.cuda.amp.autocast():
89
+ image_features = model.get_image_features(input_pixels)
90
+ text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
91
+ text_features = model.get_text_features(text_features)
92
+
93
+ image_features /= image_features.norm(dim=-1, keepdim=True)
94
+ text_features /= text_features.norm(dim=-1, keepdim=True)
95
+
96
+ text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
97
+
98
+ print("Label probs:", text_probs)
99
+
100
+ ```
101
 
102
  ## BibTeX & Citation
103
+
104
+ ```
105
+ @misc{huang2024llm2clippowerfullanguagemodel,
106
+ title={LLM2CLIP: Powerful Language Model Unlock Richer Visual Representation},
107
+ author={Weiquan Huang and Aoqi Wu and Yifan Yang and Xufang Luo and Yuqing Yang and Liang Hu and Qi Dai and Xiyang Dai and Dongdong Chen and Chong Luo and Lili Qiu},
108
+ year={2024},
109
+ eprint={2411.04997},
110
+ archivePrefix={arXiv},
111
+ primaryClass={cs.CV},
112
+ url={https://arxiv.org/abs/2411.04997},
113
+ }
config.json CHANGED
@@ -1,179 +1,27 @@
1
  {
2
- "_commit_hash": null,
3
- "_name_or_path": "LLM2CLIP-Openai-L-14",
4
  "architectures": [
5
- "CLIPModel"
6
  ],
7
  "auto_map": {
8
  "AutoConfig": "configuration_clip.CLIPConfig",
9
- "AutoModel": "modeling_clip.CLIPModel"
10
  },
11
  "initializer_factor": 1.0,
12
  "logit_scale_init_value": 2.6592,
13
  "model_type": "clip",
14
  "projection_dim": 1280,
15
  "text_config": {
16
- "_name_or_path": "",
17
- "add_cross_attention": false,
18
- "architectures": null,
19
- "attention_dropout": 0.0,
20
- "bad_words_ids": null,
21
- "begin_suppress_tokens": null,
22
  "bos_token_id": 0,
23
- "chunk_size_feed_forward": 0,
24
- "cross_attention_hidden_size": null,
25
- "decoder_start_token_id": null,
26
- "diversity_penalty": 0.0,
27
- "do_sample": false,
28
- "early_stopping": false,
29
- "encoder_no_repeat_ngram_size": 0,
30
  "eos_token_id": 2,
31
- "exponential_decay_length_penalty": null,
32
- "finetuning_task": null,
33
- "forced_bos_token_id": null,
34
- "forced_eos_token_id": null,
35
- "hidden_act": "gelu",
36
- "hidden_size": 512,
37
- "id2label": {
38
- "0": "LABEL_0",
39
- "1": "LABEL_1"
40
- },
41
- "initializer_factor": 1.0,
42
- "initializer_range": 0.02,
43
- "intermediate_size": 2048,
44
- "is_decoder": false,
45
- "is_encoder_decoder": false,
46
- "k_bias": true,
47
- "label2id": {
48
- "LABEL_0": 0,
49
- "LABEL_1": 1
50
- },
51
- "layer_norm_eps": 1e-05,
52
- "length_penalty": 1.0,
53
- "max_length": 20,
54
- "max_position_embeddings": 77,
55
- "min_length": 0,
56
  "model_type": "clip_text_model",
57
- "no_repeat_ngram_size": 0,
58
- "num_attention_heads": 8,
59
- "num_beam_groups": 1,
60
- "num_beams": 1,
61
- "num_hidden_layers": 12,
62
- "num_return_sequences": 1,
63
- "output_attentions": false,
64
- "output_hidden_states": false,
65
- "output_scores": false,
66
- "pad_token_id": 1,
67
- "post_layernorm": false,
68
- "prefix": null,
69
- "problem_type": null,
70
- "projection_dim": 512,
71
- "pruned_heads": {},
72
- "q_bias": true,
73
- "remove_invalid_values": false,
74
- "repetition_penalty": 1.0,
75
- "return_dict": true,
76
- "return_dict_in_generate": false,
77
- "sep_token_id": null,
78
- "suppress_tokens": null,
79
- "task_specific_params": null,
80
- "temperature": 1.0,
81
- "tf_legacy_loss": false,
82
- "tie_encoder_decoder": false,
83
- "tie_word_embeddings": true,
84
- "tokenizer_class": null,
85
- "top_k": 50,
86
- "top_p": 1.0,
87
- "torch_dtype": null,
88
- "torchscript": false,
89
- "transformers_version": "4.44.2",
90
- "typical_p": 1.0,
91
- "use_bfloat16": false,
92
- "v_bias": true,
93
- "vocab_size": 49408
94
  },
95
  "torch_dtype": "float32",
96
- "transformers_version": null,
97
  "vision_config": {
98
- "_name_or_path": "",
99
- "add_cross_attention": false,
100
- "architectures": null,
101
- "attention_dropout": 0.0,
102
- "bad_words_ids": null,
103
- "begin_suppress_tokens": null,
104
- "bos_token_id": null,
105
- "chunk_size_feed_forward": 0,
106
- "cross_attention_hidden_size": null,
107
- "decoder_start_token_id": null,
108
- "diversity_penalty": 0.0,
109
- "do_sample": false,
110
  "dropout": 0.0,
111
- "early_stopping": false,
112
- "encoder_no_repeat_ngram_size": 0,
113
- "eos_token_id": null,
114
- "exponential_decay_length_penalty": null,
115
- "finetuning_task": null,
116
- "forced_bos_token_id": null,
117
- "forced_eos_token_id": null,
118
- "hidden_act": "gelu",
119
- "hidden_size": 768,
120
- "id2label": {
121
- "0": "LABEL_0",
122
- "1": "LABEL_1"
123
- },
124
- "image_size": 224,
125
- "initializer_factor": 1.0,
126
- "initializer_range": 0.02,
127
- "intermediate_size": 3072,
128
- "is_decoder": false,
129
- "is_encoder_decoder": false,
130
- "k_bias": true,
131
- "label2id": {
132
- "LABEL_0": 0,
133
- "LABEL_1": 1
134
- },
135
- "layer_norm_eps": 1e-05,
136
- "length_penalty": 1.0,
137
- "max_length": 20,
138
- "min_length": 0,
139
  "model_type": "clip_vision_model",
140
- "no_repeat_ngram_size": 0,
141
- "num_attention_heads": 12,
142
- "num_beam_groups": 1,
143
- "num_beams": 1,
144
- "num_channels": 3,
145
- "num_hidden_layers": 12,
146
- "num_return_sequences": 1,
147
- "output_attentions": false,
148
- "output_hidden_states": false,
149
- "output_scores": false,
150
- "pad_token_id": null,
151
- "patch_size": 16,
152
- "post_layernorm": false,
153
- "prefix": null,
154
- "problem_type": null,
155
- "projection_dim": 512,
156
- "pruned_heads": {},
157
- "q_bias": true,
158
- "remove_invalid_values": false,
159
- "repetition_penalty": 1.0,
160
- "return_dict": true,
161
- "return_dict_in_generate": false,
162
- "sep_token_id": null,
163
- "suppress_tokens": null,
164
- "task_specific_params": null,
165
- "temperature": 1.0,
166
- "tf_legacy_loss": false,
167
- "tie_encoder_decoder": false,
168
- "tie_word_embeddings": true,
169
- "tokenizer_class": null,
170
- "top_k": 50,
171
- "top_p": 1.0,
172
- "torch_dtype": null,
173
- "torchscript": false,
174
- "transformers_version": "4.44.2",
175
- "typical_p": 1.0,
176
- "use_bfloat16": false,
177
- "v_bias": true
178
  }
179
  }
 
1
  {
 
 
2
  "architectures": [
3
+ "LLM2CLIPModel"
4
  ],
5
  "auto_map": {
6
  "AutoConfig": "configuration_clip.CLIPConfig",
7
+ "AutoModel": "modeling_clip.LLM2CLIPModel"
8
  },
9
  "initializer_factor": 1.0,
10
  "logit_scale_init_value": 2.6592,
11
  "model_type": "clip",
12
  "projection_dim": 1280,
13
  "text_config": {
 
14
  "bos_token_id": 0,
15
+ "dropout": 0.0,
16
  "eos_token_id": 2,
17
  "model_type": "clip_text_model",
18
+ "projection_dim": 1280
19
  },
20
  "torch_dtype": "float32",
21
+ "transformers_version": "4.40.2",
22
  "vision_config": {
23
  "dropout": 0.0,
24
  "model_type": "clip_vision_model",
25
+ "patch_size": 16
26
  }
27
  }
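
The trimmed config keeps only the fields that differ from the defaults and routes loading through the repo's remote code via `auto_map` (`configuration_clip.CLIPConfig` / `modeling_clip.LLM2CLIPModel`). A minimal sketch of what that resolution looks like, assuming the published repo id `microsoft/LLM2CLIP-Openai-B-16` used in the README (custom code requires `trust_remote_code=True`):

```python
# Sketch only: assumes Hub access and that executing the repo's remote code
# (configuration_clip.py / modeling_clip.py) is acceptable.
from transformers import AutoConfig, AutoModel

repo = "microsoft/LLM2CLIP-Openai-B-16"

config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(type(config).__name__)            # CLIPConfig, resolved from configuration_clip.py
print(config.projection_dim)            # 1280, per the updated config.json
print(config.vision_config.patch_size)  # 16, the only explicit vision override left

model = AutoModel.from_pretrained(repo, trust_remote_code=True)
print(type(model).__name__)             # LLM2CLIPModel, resolved from modeling_clip.py
```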
configuration_clip.py CHANGED
@@ -26,9 +26,9 @@ if TYPE_CHECKING:
26
  from transformers.utils import TensorType
27
 
28
  from transformers.configuration_utils import PretrainedConfig
 
29
  from transformers.utils import logging
30
 
31
-
32
  logger = logging.get_logger(__name__)
33
 
34
 
@@ -50,25 +50,33 @@ class CLIPTextConfig(PretrainedConfig):
50
  Dimensionality of the encoder layers and the pooler layer.
51
  intermediate_size (`int`, *optional*, defaults to 2048):
52
  Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 
 
53
  num_hidden_layers (`int`, *optional*, defaults to 12):
54
  Number of hidden layers in the Transformer encoder.
55
  num_attention_heads (`int`, *optional*, defaults to 8):
56
  Number of attention heads for each attention layer in the Transformer encoder.
57
- max_position_embeddings (`int`, *optional*, defaults to 77):`
58
  The maximum sequence length that this model might ever be used with. Typically set this to something large
59
  just in case (e.g., 512 or 1024 or 2048).
60
  hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
61
  The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
62
  `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
63
- layer_norm_eps (`float`, *optional*, defaults to 1e-5):
64
  The epsilon used by the layer normalization layers.
65
  attention_dropout (`float`, *optional*, defaults to 0.0):
66
  The dropout ratio for the attention probabilities.
67
  initializer_range (`float`, *optional*, defaults to 0.02):
68
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
69
- initializer_factor (`float`, *optional*, defaults to 1):
70
  A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
71
  testing).
72
 
73
  Example:
74
 
@@ -84,7 +92,9 @@ class CLIPTextConfig(PretrainedConfig):
84
  >>> # Accessing the model configuration
85
  >>> configuration = model.config
86
  ```"""
 
87
  model_type = "clip_text_model"
 
88
 
89
  def __init__(
90
  self,
@@ -95,18 +105,16 @@ class CLIPTextConfig(PretrainedConfig):
95
  num_hidden_layers=12,
96
  num_attention_heads=8,
97
  max_position_embeddings=77,
98
- hidden_act="gelu",
99
  layer_norm_eps=1e-5,
100
  attention_dropout=0.0,
101
  initializer_range=0.02,
102
  initializer_factor=1.0,
103
- q_bias=True,
104
- k_bias=True,
105
- v_bias=True,
106
- post_layernorm=False,
107
  pad_token_id=1,
108
- bos_token_id=0,
109
- eos_token_id=2,
110
  **kwargs,
111
  ):
112
  super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -122,28 +130,8 @@ class CLIPTextConfig(PretrainedConfig):
122
  self.hidden_act = hidden_act
123
  self.initializer_range = initializer_range
124
  self.initializer_factor = initializer_factor
125
- self.q_bias=q_bias
126
- self.k_bias=k_bias
127
- self.v_bias=v_bias
128
- self.post_layernorm = post_layernorm
129
  self.attention_dropout = attention_dropout
130
 
131
- @classmethod
132
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
133
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
134
-
135
- # get the text config dict if we are loading from CLIPConfig
136
- if config_dict.get("model_type") == "clip":
137
- config_dict = config_dict["text_config"]
138
-
139
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
140
- logger.warning(
141
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
142
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
143
- )
144
-
145
- return cls.from_dict(config_dict, **kwargs)
146
-
147
 
148
  class CLIPVisionConfig(PretrainedConfig):
149
  r"""
@@ -160,24 +148,28 @@ class CLIPVisionConfig(PretrainedConfig):
160
  Dimensionality of the encoder layers and the pooler layer.
161
  intermediate_size (`int`, *optional*, defaults to 3072):
162
  Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 
 
163
  num_hidden_layers (`int`, *optional*, defaults to 12):
164
  Number of hidden layers in the Transformer encoder.
165
  num_attention_heads (`int`, *optional*, defaults to 12):
166
  Number of attention heads for each attention layer in the Transformer encoder.
 
 
167
  image_size (`int`, *optional*, defaults to 224):
168
  The size (resolution) of each image.
169
  patch_size (`int`, *optional*, defaults to 32):
170
  The size (resolution) of each patch.
171
  hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
172
  The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
173
- `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
174
- layer_norm_eps (`float`, *optional*, defaults to 1e-5):
175
  The epsilon used by the layer normalization layers.
176
  attention_dropout (`float`, *optional*, defaults to 0.0):
177
  The dropout ratio for the attention probabilities.
178
  initializer_range (`float`, *optional*, defaults to 0.02):
179
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
180
- initializer_factor (`float`, *optional*, defaults to 1):
181
  A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
182
  testing).
183
 
@@ -197,6 +189,7 @@ class CLIPVisionConfig(PretrainedConfig):
197
  ```"""
198
 
199
  model_type = "clip_vision_model"
 
200
 
201
  def __init__(
202
  self,
@@ -208,15 +201,11 @@ class CLIPVisionConfig(PretrainedConfig):
208
  num_channels=3,
209
  image_size=224,
210
  patch_size=32,
211
- hidden_act="gelu",
212
  layer_norm_eps=1e-5,
213
  attention_dropout=0.0,
214
  initializer_range=0.02,
215
  initializer_factor=1.0,
216
- q_bias=True,
217
- k_bias=True,
218
- v_bias=True,
219
- post_layernorm=False,
220
  **kwargs,
221
  ):
222
  super().__init__(**kwargs)
@@ -231,30 +220,10 @@ class CLIPVisionConfig(PretrainedConfig):
231
  self.image_size = image_size
232
  self.initializer_range = initializer_range
233
  self.initializer_factor = initializer_factor
234
- self.q_bias=q_bias
235
- self.k_bias=k_bias
236
- self.v_bias=v_bias
237
- self.post_layernorm = post_layernorm
238
  self.attention_dropout = attention_dropout
239
  self.layer_norm_eps = layer_norm_eps
240
  self.hidden_act = hidden_act
241
 
242
- @classmethod
243
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
244
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
245
-
246
- # get the vision config dict if we are loading from CLIPConfig
247
- if config_dict.get("model_type") == "clip":
248
- config_dict = config_dict["vision_config"]
249
-
250
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
251
- logger.warning(
252
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
253
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
254
- )
255
-
256
- return cls.from_dict(config_dict, **kwargs)
257
-
258
 
259
  class CLIPConfig(PretrainedConfig):
260
  r"""
@@ -272,9 +241,9 @@ class CLIPConfig(PretrainedConfig):
272
  vision_config (`dict`, *optional*):
273
  Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
274
  projection_dim (`int`, *optional*, defaults to 512):
275
- Dimentionality of text and vision projection layers.
276
  logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
277
- The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation.
278
  kwargs (*optional*):
279
  Dictionary of keyword arguments.
280
 
@@ -303,7 +272,7 @@ class CLIPConfig(PretrainedConfig):
303
  ```"""
304
 
305
  model_type = "clip"
306
- is_composition = True
307
 
308
  def __init__(
309
  self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
@@ -339,9 +308,9 @@ class CLIPConfig(PretrainedConfig):
339
  else:
340
  message = (
341
  f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
342
- f'value `text_config["{key}"]` will be overriden.'
343
  )
344
- logger.warning(message)
345
 
346
  # Update all values in `text_config` with the ones in `_text_config_dict`.
347
  text_config.update(_text_config_dict)
@@ -371,9 +340,9 @@ class CLIPConfig(PretrainedConfig):
371
  else:
372
  message = (
373
  f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
374
- f'The value `vision_config["{key}"]` will be overriden.'
375
  )
376
- logger.warning(message)
377
 
378
  # Update all values in `vision_config` with the ones in `_vision_config_dict`.
379
  vision_config.update(_vision_config_dict)
@@ -405,16 +374,48 @@ class CLIPConfig(PretrainedConfig):
405
 
406
  return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
407
 
408
- def to_dict(self):
409
- """
410
- Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
411
-
412
- Returns:
413
- `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
414
- """
415
- output = copy.deepcopy(self.__dict__)
416
- output["text_config"] = self.text_config.to_dict()
417
- output["vision_config"] = self.vision_config.to_dict()
418
- output["model_type"] = self.__class__.model_type
419
- return output
420
26
  from transformers.utils import TensorType
27
 
28
  from transformers.configuration_utils import PretrainedConfig
29
+ from transformers.onnx import OnnxConfig
30
  from transformers.utils import logging
31
 
 
32
  logger = logging.get_logger(__name__)
33
 
34
 
 
50
  Dimensionality of the encoder layers and the pooler layer.
51
  intermediate_size (`int`, *optional*, defaults to 2048):
52
  Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
53
+ projection_dim (`int`, *optional*, defaults to 512):
54
+ Dimensionality of text and vision projection layers.
55
  num_hidden_layers (`int`, *optional*, defaults to 12):
56
  Number of hidden layers in the Transformer encoder.
57
  num_attention_heads (`int`, *optional*, defaults to 8):
58
  Number of attention heads for each attention layer in the Transformer encoder.
59
+ max_position_embeddings (`int`, *optional*, defaults to 77):
60
  The maximum sequence length that this model might ever be used with. Typically set this to something large
61
  just in case (e.g., 512 or 1024 or 2048).
62
  hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
63
  The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
64
  `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
65
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05):
66
  The epsilon used by the layer normalization layers.
67
  attention_dropout (`float`, *optional*, defaults to 0.0):
68
  The dropout ratio for the attention probabilities.
69
  initializer_range (`float`, *optional*, defaults to 0.02):
70
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
71
+ initializer_factor (`float`, *optional*, defaults to 1.0):
72
  A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
73
  testing).
74
+ pad_token_id (`int`, *optional*, defaults to 1):
75
+ Padding token id.
76
+ bos_token_id (`int`, *optional*, defaults to 49406):
77
+ Beginning of stream token id.
78
+ eos_token_id (`int`, *optional*, defaults to 49407):
79
+ End of stream token id.
80
 
81
  Example:
82
 
 
92
  >>> # Accessing the model configuration
93
  >>> configuration = model.config
94
  ```"""
95
+
96
  model_type = "clip_text_model"
97
+ base_config_key = "text_config"
98
 
99
  def __init__(
100
  self,
 
105
  num_hidden_layers=12,
106
  num_attention_heads=8,
107
  max_position_embeddings=77,
108
+ hidden_act="quick_gelu",
109
  layer_norm_eps=1e-5,
110
  attention_dropout=0.0,
111
  initializer_range=0.02,
112
  initializer_factor=1.0,
113
+ # This differs from `CLIPTokenizer`'s default and from openai/clip
114
+ # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
 
 
115
  pad_token_id=1,
116
+ bos_token_id=49406,
117
+ eos_token_id=49407,
118
  **kwargs,
119
  ):
120
  super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
 
130
  self.hidden_act = hidden_act
131
  self.initializer_range = initializer_range
132
  self.initializer_factor = initializer_factor
133
  self.attention_dropout = attention_dropout
134
 
135
 
136
  class CLIPVisionConfig(PretrainedConfig):
137
  r"""
 
148
  Dimensionality of the encoder layers and the pooler layer.
149
  intermediate_size (`int`, *optional*, defaults to 3072):
150
  Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
151
+ projection_dim (`int`, *optional*, defaults to 512):
152
+ Dimensionality of text and vision projection layers.
153
  num_hidden_layers (`int`, *optional*, defaults to 12):
154
  Number of hidden layers in the Transformer encoder.
155
  num_attention_heads (`int`, *optional*, defaults to 12):
156
  Number of attention heads for each attention layer in the Transformer encoder.
157
+ num_channels (`int`, *optional*, defaults to 3):
158
+ The number of input channels.
159
  image_size (`int`, *optional*, defaults to 224):
160
  The size (resolution) of each image.
161
  patch_size (`int`, *optional*, defaults to 32):
162
  The size (resolution) of each patch.
163
  hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
164
  The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
165
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
166
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05):
167
  The epsilon used by the layer normalization layers.
168
  attention_dropout (`float`, *optional*, defaults to 0.0):
169
  The dropout ratio for the attention probabilities.
170
  initializer_range (`float`, *optional*, defaults to 0.02):
171
  The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
172
+ initializer_factor (`float`, *optional*, defaults to 1.0):
173
  A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
174
  testing).
175
 
 
189
  ```"""
190
 
191
  model_type = "clip_vision_model"
192
+ base_config_key = "vision_config"
193
 
194
  def __init__(
195
  self,
 
201
  num_channels=3,
202
  image_size=224,
203
  patch_size=32,
204
+ hidden_act="quick_gelu",
205
  layer_norm_eps=1e-5,
206
  attention_dropout=0.0,
207
  initializer_range=0.02,
208
  initializer_factor=1.0,
209
  **kwargs,
210
  ):
211
  super().__init__(**kwargs)
 
220
  self.image_size = image_size
221
  self.initializer_range = initializer_range
222
  self.initializer_factor = initializer_factor
223
  self.attention_dropout = attention_dropout
224
  self.layer_norm_eps = layer_norm_eps
225
  self.hidden_act = hidden_act
226
 
227
 
228
  class CLIPConfig(PretrainedConfig):
229
  r"""
 
241
  vision_config (`dict`, *optional*):
242
  Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
243
  projection_dim (`int`, *optional*, defaults to 512):
244
+ Dimensionality of text and vision projection layers.
245
  logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
246
+ The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
247
  kwargs (*optional*):
248
  Dictionary of keyword arguments.
249
 
 
272
  ```"""
273
 
274
  model_type = "clip"
275
+ sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig}
276
 
277
  def __init__(
278
  self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
 
308
  else:
309
  message = (
310
  f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
311
+ f'value `text_config["{key}"]` will be overridden.'
312
  )
313
+ logger.info(message)
314
 
315
  # Update all values in `text_config` with the ones in `_text_config_dict`.
316
  text_config.update(_text_config_dict)
 
340
  else:
341
  message = (
342
  f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
343
+ f'The value `vision_config["{key}"]` will be overridden.'
344
  )
345
+ logger.info(message)
346
 
347
  # Update all values in `vision_config` with the ones in `_vision_config_dict`.
348
  vision_config.update(_vision_config_dict)
 
374
 
375
  return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
376
 
377
 
378
+ class CLIPOnnxConfig(OnnxConfig):
379
+ @property
380
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
381
+ return OrderedDict(
382
+ [
383
+ ("input_ids", {0: "batch", 1: "sequence"}),
384
+ ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
385
+ ("attention_mask", {0: "batch", 1: "sequence"}),
386
+ ]
387
+ )
388
+
389
+ @property
390
+ def outputs(self) -> Mapping[str, Mapping[int, str]]:
391
+ return OrderedDict(
392
+ [
393
+ ("logits_per_image", {0: "batch"}),
394
+ ("logits_per_text", {0: "batch"}),
395
+ ("text_embeds", {0: "batch"}),
396
+ ("image_embeds", {0: "batch"}),
397
+ ]
398
+ )
399
+
400
+ @property
401
+ def atol_for_validation(self) -> float:
402
+ return 1e-4
403
+
404
+ def generate_dummy_inputs(
405
+ self,
406
+ processor: "ProcessorMixin",
407
+ batch_size: int = -1,
408
+ seq_length: int = -1,
409
+ framework: Optional["TensorType"] = None,
410
+ ) -> Mapping[str, Any]:
411
+ text_input_dict = super().generate_dummy_inputs(
412
+ processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
413
+ )
414
+ image_input_dict = super().generate_dummy_inputs(
415
+ processor.image_processor, batch_size=batch_size, framework=framework
416
+ )
417
+ return {**text_input_dict, **image_input_dict}
418
+
419
+ @property
420
+ def default_onnx_opset(self) -> int:
421
+ return 14
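
With the custom `from_pretrained`/`to_dict` overrides removed, the config classes behave like stock `transformers` composite configs. A minimal sketch of how they compose, assuming `configuration_clip.py` from this repo has been downloaded and sits on the import path:

```python
# Sketch only: assumes configuration_clip.py from this repo sits next to this script.
from configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig

text_config = CLIPTextConfig()                    # new defaults: hidden_act="quick_gelu", bos/eos 49406/49407
vision_config = CLIPVisionConfig(patch_size=16)   # ViT-B/16 patch size, matching the new config.json

config = CLIPConfig(
    text_config=text_config.to_dict(),
    vision_config=vision_config.to_dict(),
    projection_dim=1280,                          # projection width used by LLM2CLIP-Openai-B-16
)
print(config.text_config.hidden_act, config.vision_config.patch_size)  # quick_gelu 16
```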
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38781b3a1f206115e135e24f48158f678e41d8600848876db931ff08d55150be
3
- size 347196398
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fd872fd6bf16bfba5624e8f13c14168f5b25496fc25246c04556bc858dd9a6d
3
+ size 1442236212
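
The weights move from a pickled `pytorch_model.bin` to `model.safetensors`, and the file grows from ~347 MB to ~1.44 GB, consistent with the new text adapter being bundled alongside the vision tower. A hypothetical local sanity check, assuming the `safetensors` package is installed and the LFS object (not just the pointer shown above) has been pulled:

```python
# Hypothetical check only: run from a local clone where `git lfs pull` has
# materialized model.safetensors.
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    keys = list(f.keys())

print(len(keys), "tensors")
print([k for k in keys if "text_adapter" in k][:3])  # adapter weights expected under the new text_adapter module
```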
modeling_clip.py CHANGED
@@ -37,9 +37,9 @@ from transformers.utils import (
37
  logging,
38
  replace_return_docstrings,
39
  )
 
40
  from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
41
 
42
-
43
  if is_flash_attn_2_available():
44
  from transformers.modeling_flash_attention_utils import _flash_attention_forward
45
 
@@ -603,16 +603,15 @@ class CLIPPreTrainedModel(PreTrainedModel):
603
  fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
604
  nn.init.normal_(module.fc1.weight, std=fc_std)
605
  nn.init.normal_(module.fc2.weight, std=in_proj_std)
606
- elif isinstance(module, CLIPModel):
607
- pass
608
  # nn.init.normal_(
609
  # module.text_projection.weight,
610
  # std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
611
  # )
612
- # nn.init.normal_(
613
- # module.visual_projection.weight,
614
- # std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
615
- # )
616
  elif isinstance(module, CLIPVisionModelWithProjection):
617
  nn.init.normal_(
618
  module.visual_projection.weight,
@@ -1112,80 +1111,97 @@ class CLIPVisionModel(CLIPPreTrainedModel):
1112
 
1113
 
1114
  @add_start_docstrings(CLIP_START_DOCSTRING)
1115
- class CLIPModel(CLIPPreTrainedModel):
1116
  config_class = CLIPConfig
1117
  _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]
1118
 
1119
  def __init__(self, config: CLIPConfig):
1120
  super().__init__(config)
 
1121
  if not isinstance(config.vision_config, CLIPVisionConfig):
1122
  raise TypeError(
1123
  "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
1124
  f" {type(config.vision_config)}."
1125
  )
1126
 
 
1127
  vision_config = config.vision_config
1128
 
1129
  self.projection_dim = config.projection_dim
 
1130
  self.vision_embed_dim = vision_config.hidden_size
 
1131
 
1132
  vision_model = CLIPVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation)
1133
  self.vision_model = vision_model.vision_model
1134
 
1135
- # self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
1136
- scale = self.vision_embed_dim ** -0.5
1137
- self.visual_projection = nn.Parameter(scale * torch.randn(self.vision_embed_dim, self.projection_dim))
1138
  self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
1139
 
1140
  # Initialize weights and apply final processing
1141
  self.post_init()
1142
-
1143
- @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
1144
- def get_text_features(
1145
- self,
1146
- input_ids: Optional[torch.Tensor] = None,
1147
- attention_mask: Optional[torch.Tensor] = None,
1148
- position_ids: Optional[torch.Tensor] = None,
1149
- output_attentions: Optional[bool] = None,
1150
- output_hidden_states: Optional[bool] = None,
1151
- return_dict: Optional[bool] = None,
1152
- ) -> torch.FloatTensor:
1153
- r"""
1154
- Returns:
1155
- text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
1156
- applying the projection layer to the pooled output of [`CLIPTextModel`].
1157
-
1158
- Examples:
1159
-
1160
- ```python
1161
- >>> from transformers import AutoTokenizer, CLIPModel
1162
-
1163
- >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
1164
- >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
1165
-
1166
- >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
1167
- >>> text_features = model.get_text_features(**inputs)
1168
- ```"""
1169
- # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
1170
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1171
- output_hidden_states = (
1172
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1173
- )
1174
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1175
-
1176
- text_outputs = self.text_model(
1177
- input_ids=input_ids,
1178
- attention_mask=attention_mask,
1179
- position_ids=position_ids,
1180
- output_attentions=output_attentions,
1181
- output_hidden_states=output_hidden_states,
1182
- return_dict=return_dict,
1183
- )
1184
-
1185
- pooled_output = text_outputs[1]
1186
- text_features = self.text_projection(pooled_output)
1187
-
1188
- return text_features
1189
 
1190
  @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
1191
  def get_image_features(
@@ -1232,7 +1248,7 @@ class CLIPModel(CLIPPreTrainedModel):
1232
  )
1233
 
1234
  pooled_output = vision_outputs[1] # pooled_output
1235
- image_features = pooled_output @ self.visual_projection
1236
 
1237
  return image_features
1238
 
@@ -1413,7 +1429,40 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
1413
  attentions=text_outputs.attentions,
1414
  )
1415
 
1416
-
1417
  @add_start_docstrings(
1418
  """
1419
  CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
 
37
  logging,
38
  replace_return_docstrings,
39
  )
40
+ # from configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
41
  from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
42
 
 
43
  if is_flash_attn_2_available():
44
  from transformers.modeling_flash_attention_utils import _flash_attention_forward
45
 
 
603
  fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
604
  nn.init.normal_(module.fc1.weight, std=fc_std)
605
  nn.init.normal_(module.fc2.weight, std=in_proj_std)
606
+ elif isinstance(module, LLM2CLIPModel):
 
607
  # nn.init.normal_(
608
  # module.text_projection.weight,
609
  # std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
610
  # )
611
+ nn.init.normal_(
612
+ module.visual_projection.weight,
613
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
614
+ )
615
  elif isinstance(module, CLIPVisionModelWithProjection):
616
  nn.init.normal_(
617
  module.visual_projection.weight,
 
1111
 
1112
 
1113
  @add_start_docstrings(CLIP_START_DOCSTRING)
1114
+ class LLM2CLIPModel(CLIPPreTrainedModel):
1115
  config_class = CLIPConfig
1116
  _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]
1117
 
1118
  def __init__(self, config: CLIPConfig):
1119
  super().__init__(config)
1120
+ # if not isinstance(config.text_config, CLIPTextConfig):
1121
+ # raise TypeError(
1122
+ # "config.text_config is expected to be of type CLIPTextConfig but is of type"
1123
+ # f" {type(config.text_config)}."
1124
+ # )
1125
+
1126
  if not isinstance(config.vision_config, CLIPVisionConfig):
1127
  raise TypeError(
1128
  "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
1129
  f" {type(config.vision_config)}."
1130
  )
1131
 
1132
+ # text_config = config.text_config
1133
  vision_config = config.vision_config
1134
 
1135
  self.projection_dim = config.projection_dim
1136
+ # self.text_embed_dim = text_config.hidden_size
1137
  self.vision_embed_dim = vision_config.hidden_size
1138
+
1139
+ adapter = LLM2CLIP_Adapter()
1140
+ self.text_adapter = adapter
1141
+
1142
+ # text_model = CLIPTextModel._from_config(text_config, attn_implementation=config._attn_implementation)
1143
+ # self.text_model = text_model.text_model
1144
 
1145
  vision_model = CLIPVisionModel._from_config(vision_config, attn_implementation=config._attn_implementation)
1146
  self.vision_model = vision_model.vision_model
1147
 
1148
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
1149
+ # self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
 
1150
  self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
1151
 
1152
  # Initialize weights and apply final processing
1153
  self.post_init()
1154
+
1155
+ def get_text_features(self, inputs):
1156
+ #TODO: make this more flexible and configurable
1157
+ return self.text_adapter(inputs)
1158
+
1159
+ # @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
1160
+ # def get_text_features(
1161
+ # self,
1162
+ # input_ids: Optional[torch.Tensor] = None,
1163
+ # attention_mask: Optional[torch.Tensor] = None,
1164
+ # position_ids: Optional[torch.Tensor] = None,
1165
+ # output_attentions: Optional[bool] = None,
1166
+ # output_hidden_states: Optional[bool] = None,
1167
+ # return_dict: Optional[bool] = None,
1168
+ # ) -> torch.FloatTensor:
1169
+ # r"""
1170
+ # Returns:
1171
+ # text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
1172
+ # applying the projection layer to the pooled output of [`CLIPTextModel`].
1173
+
1174
+ # Examples:
1175
+
1176
+ # ```python
1177
+ # >>> from transformers import AutoTokenizer, CLIPModel
1178
+
1179
+ # >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
1180
+ # >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
1181
+
1182
+ # >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
1183
+ # >>> text_features = model.get_text_features(**inputs)
1184
+ # ```"""
1185
+ # # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
1186
+ # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1187
+ # output_hidden_states = (
1188
+ # output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1189
+ # )
1190
+ # return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1191
+
1192
+ # text_outputs = self.text_model(
1193
+ # input_ids=input_ids,
1194
+ # attention_mask=attention_mask,
1195
+ # position_ids=position_ids,
1196
+ # output_attentions=output_attentions,
1197
+ # output_hidden_states=output_hidden_states,
1198
+ # return_dict=return_dict,
1199
+ # )
1200
+
1201
+ # pooled_output = text_outputs[1]
1202
+ # text_features = self.text_projection(pooled_output)
1203
+
1204
+ # return text_features
1205
 
1206
  @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
1207
  def get_image_features(
 
1248
  )
1249
 
1250
  pooled_output = vision_outputs[1] # pooled_output
1251
+ image_features = self.visual_projection(pooled_output)
1252
 
1253
  return image_features
1254
 
 
1429
  attentions=text_outputs.attentions,
1430
  )
1431
 
1432
+ class LinearBlock(nn.Module):
1433
+ def __init__(self, dim, expansion_factor=4, dropout=0., norm_layer=nn.LayerNorm):
1434
+ super().__init__()
1435
+ self.fn = nn.Sequential(
1436
+ nn.Linear(dim, int(expansion_factor * dim)),
1437
+ nn.GELU(),
1438
+ nn.Dropout(dropout),
1439
+ nn.Linear(int(expansion_factor * dim), dim),
1440
+ )
1441
+ self.ln = norm_layer(dim)
1442
+
1443
+ def forward(self, x):
1444
+ return x + self.fn(self.ln(x))
1445
+
1446
+ class LLM2CLIP_Adapter(nn.Module):
1447
+ def __init__(self):
1448
+ super().__init__()
1449
+ #TODO: make this more flexible and configurable
1450
+ # hard-coded values from the LLM2CLIP model
1451
+ text_embedding_dim = 4096
1452
+ expansion_factor = 2
1453
+ adaptor_num_layers = 4
1454
+ proj_bias = True
1455
+ output_dim = 1280
1456
+ self.adaptor = nn.Sequential(
1457
+ *[LinearBlock(text_embedding_dim, expansion_factor) for _ in range(adaptor_num_layers)],
1458
+ nn.LayerNorm(text_embedding_dim),
1459
+ nn.Linear(text_embedding_dim, output_dim, bias=proj_bias),
1460
+ )
1461
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
1462
+ hidden_states = torch.nn.functional.normalize(hidden_states, p=2, dim=1)
1463
+ hidden_states = self.adaptor(hidden_states)
1464
+ return hidden_states
1465
+
1466
  @add_start_docstrings(
1467
  """
1468
  CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
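
Taken together, the new `LLM2CLIPModel.get_text_features` no longer runs a CLIP text tower: it takes pre-computed LLM sentence embeddings (4096-dimensional for the Llama-3-8B encoder used in the README), L2-normalizes them, and projects them through `LLM2CLIP_Adapter` into the 1280-dimensional space shared with `get_image_features`. A minimal sketch, assuming `model` is an `LLM2CLIPModel` loaded with `trust_remote_code=True` as in the README:

```python
# Sketch only: the random tensor stands in for l2v.encode(captions), which would
# normally supply the 4096-d LLM embeddings; `model` is assumed to be loaded
# on CUDA in float16 as in the README examples.
import torch

llm_embeddings = torch.randn(3, 4096, dtype=torch.float16, device="cuda")

with torch.no_grad(), torch.cuda.amp.autocast():
    text_features = model.get_text_features(llm_embeddings)  # normalize -> adapter -> 1280-d

print(text_features.shape)  # torch.Size([3, 1280]), same space as get_image_features output
```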