nlpcver committed on
Commit
ceb28a7
·
1 Parent(s): 5328e40

Add application file

Browse files
.gitignore ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ **/__pycache__
4
+ *.py[cod]
5
+ *$py.class
6
+
7
+ # C extensions
8
+ *.so
9
+
10
+ # Distribution / packaging
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # ours
31
+ tools/
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from typing import List
3
+ from PIL import Image
4
+ from zhclip import ZhCLIPProcessor, ZhCLIPModel # From https://www.github.com/thu-ml/zh-clip
5
+
6
# Checkpoint identifier handed to both `from_pretrained` calls below.
version = 'thu-ml/zh-clip-vit-roberta-large-patch14'

# Created once at module import; `inference` reads these module-level objects.
processor = ZhCLIPProcessor.from_pretrained(version)
model = ZhCLIPModel.from_pretrained(version)
9
+
10
def inference(image, texts: List[str]):
    """Score one image against a list of candidate captions.

    Args:
        image: Image passed unchanged as the processor's ``images`` argument.
        texts: Rows of candidate captions; only the first cell (``row[0]``) of
            each row is used as the caption text.

    Returns:
        A dict mapping each caption (as ``str``) to its softmax probability as a
        plain ``float``. Rows carrying the same caption have their probabilities
        added together so no entry is silently overwritten.
    """
    labels = [row[0] for row in texts]
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    outputs = model(**inputs)

    # Image/text similarity logits; presumably one row per image -- only row 0 is used.
    logits = outputs.image_features @ outputs.text_features.T
    probs = logits.softmax(dim=-1)[0].detach().cpu().numpy()

    # Key by caption text (not by position) so the label output shows the captions.
    scores = {}
    for label, prob in zip(labels, probs):
        key = str(label)
        scores[key] = scores.get(key, 0.0) + float(prob)
    return scores
18
+
19
# Static texts shown on the demo page.
title = "ZH-CLIP zero-shot classification"
description = "Chinese Clip Model (ZH-CLIP) zero-shot classification"
article = (
    "<p style='text-align: center'>"
    "<a href='https://www.github.com/thu-ml/zh-clip' target='_blank'>github: zh-clip</a>"
    " "
    "<a href='https://huggingface.co/thu-ml/zh-clip-vit-roberta-large-patch14' target='_blank'>"
    "huggingface model: thu-ml/zh-clip-vit-roberta-large-patch14</a>"
    "</p>"
)

# One example row: an image path plus a two-row caption table.
examples = [['./images/dog.jpeg', [['一只狗'], ['一只猫']]]]
interpretation = 'default'  # defined here but not passed to gr.Interface below
enable_queue = True

iface = gr.Interface(
    fn=inference,
    inputs=["image", "list"],
    outputs="label",
    title=title,
    description=description,
    article=article,
    examples=examples,
    enable_queue=enable_queue,
)
iface.launch(server_name='0.0.0.0')
images/dog.jpeg ADDED
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ transformers==4.26.1
3
+ multilingual_clip
zhclip/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from .modeling_zhclip import (
2
+ ZhCLIPModel,
3
+ )
4
+
5
+ from .configuration_zhclip import ZhCLIPConfig
6
+ from .processing_zhclip import ZhCLIPProcessor
zhclip/configuration_zhclip.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ ZhClip model configuration"""
16
+
17
+ import copy
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.utils import logging
21
+ from transformers.models.auto.configuration_auto import AutoConfig
22
+ from transformers.models.clip.configuration_clip import CLIPVisionConfig
23
+ from typing import Union, Dict
24
+
25
# Module-level logger obtained from the transformers logging helper.
logger = logging.get_logger(__name__)
26
+
27
class ZhCLIPConfig(PretrainedConfig):
    """Composite configuration holding a text config, a vision config and extra fields.

    Args:
        text_config: A ``PretrainedConfig`` instance, or a dict that contains a
            ``"model_type"`` key plus the keyword arguments of that config.
        vision_config: Same as ``text_config`` for the vision side. A ``"clip"``
            model type is reduced to its ``vision_config`` sub-config, and
            ``"clip_vision_model"`` is built as a ``CLIPVisionConfig``.
        num_token_types, hidden_size, num_hidden_layers, num_attention_heads,
        intermediate_size, hidden_act, hidden_dropout_prob,
        attention_probs_dropout_prob, initializer_range, layer_norm_eps,
        classifier_dropout: Stored unchanged as attributes of the same name
            (grouped under the "co-attention" comment in the original code).
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        KeyError: If a dict config has no ``"model_type"`` entry.
    """

    model_type = "zhclip"
    is_composition = True

    def __init__(
        self,
        text_config: Union[PretrainedConfig, Dict],
        vision_config: Union[PretrainedConfig, Dict],
        num_token_types=2,
        hidden_size=768,
        num_hidden_layers=6,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        classifier_dropout=None,
        **kwargs):
        super().__init__(**kwargs)

        if not isinstance(text_config, PretrainedConfig):
            # Copy first: popping "model_type" must not modify the caller's dict.
            text_config = dict(text_config)
            text_model_type = text_config.pop('model_type')
            text_config = AutoConfig.for_model(text_model_type, **text_config)
        self.text_config = text_config

        if not isinstance(vision_config, PretrainedConfig):
            # Same copy-before-pop protection as for the text config.
            vision_config = dict(vision_config)
            vision_model_type = vision_config.pop('model_type')
            if vision_model_type == "clip":
                # A full CLIP config carries both towers; keep only the vision part.
                vision_config = AutoConfig.for_model(vision_model_type, **vision_config).vision_config
            elif vision_model_type == "clip_vision_model":
                vision_config = CLIPVisionConfig(**vision_config)
            else:
                vision_config = AutoConfig.for_model(vision_model_type, **vision_config)
        elif vision_config.model_type == "clip":
            vision_config = vision_config.vision_config
        self.vision_config = vision_config

        # co-attention
        self.num_token_types = num_token_types
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.classifier_dropout = classifier_dropout

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Deep copy of the instance attributes in which `vision_config` and
            `text_config` are replaced by their own `to_dict()` output and `model_type` is set
            to the class-level `model_type`.
        """
        output = copy.deepcopy(self.__dict__)
        output["vision_config"] = self.vision_config.to_dict()
        output["text_config"] = self.text_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
zhclip/modeling_zhclip.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch ZH-CLIP model."""
16
+
17
+
18
+ from typing import Optional, Tuple, Union
19
+ from torch import TensorType
20
+
21
+ import torch
22
+ from torch import nn
23
+
24
+
25
+ from transformers.modeling_utils import PreTrainedModel
26
+ from transformers.utils import logging, ModelOutput
27
+ from transformers.models.auto.modeling_auto import AutoModel
28
+
29
+ from transformers.models.clip.modeling_clip import CLIPVisionConfig, CLIPVisionModel
30
+ from .configuration_zhclip import ZhCLIPConfig
31
+ from dataclasses import dataclass
32
+
33
# Module-level logger obtained from the transformers logging helper.
logger = logging.get_logger(__name__)
# Name of the configuration class, kept as a string constant.
_CONFIG_FOR_DOC = "ZhCLIPConfig"
35
+
36
@dataclass
class ZhCLIPModelOutput(ModelOutput):
    """Container for the two feature tensors returned by `ZhCLIPModel.forward`."""

    # Result of `get_text_features` (text projection applied to the pooled text output).
    text_features: Optional[torch.FloatTensor] = None
    # Result of `get_image_features` (visual projection applied to the pooled vision output).
    image_features: Optional[torch.FloatTensor] = None
41
+
42
+
43
class MeanPooler(nn.Module):
    """Mean pooling over the positions selected by an attention mask."""

    def forward(self, last_hidden_state: TensorType, attention_mask: TensorType):
        """Average `last_hidden_state` along dim 1, counting only masked-in positions.

        Args:
            last_hidden_state: Hidden states; presumably (batch, seq, hidden) -- TODO confirm.
            attention_mask: Mask broadcast over the last dim of `last_hidden_state`;
                presumably (batch, seq) with 1 for real tokens and 0 for padding.

        Returns:
            The masked sum over dim 1 divided by the number of masked-in positions.
            A row whose mask is all zeros yields zeros instead of NaN.
        """
        masked_output = last_hidden_state * attention_mask.unsqueeze(-1)
        # Clamp the count to 1 so an all-zero mask row divides 0 by 1 rather than 0 by 0.
        token_count = attention_mask.sum(-1, keepdim=True).clamp(min=1)
        return masked_output.sum(dim=1) / token_count
49
+
50
+
51
class ZhCLIPPreTrainedModel(PreTrainedModel):
    """
    Abstract base class that supplies the weight-initialization hook and the class-level
    settings (`config_class`, `base_model_prefix`, ...) used by ZH-CLIP models.
    """

    config_class = ZhCLIPConfig
    base_model_prefix = "zhclip"
    supports_gradient_checkpointing = False
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """Initialize `module` in place; only Linear, Embedding and LayerNorm are touched."""
        if isinstance(module, nn.Linear):
            # Normal(0, initializer_range) weights, zero bias when a bias exists.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            # Same normal init; the padding row (if any) is reset to zeros afterwards.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            # Zero bias and unit weight.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
74
+
75
+
76
class ZhCLIPModel(ZhCLIPPreTrainedModel):
    """Vision encoder plus text encoder with projection heads into a shared feature size.

    The pooled vision output goes through `visual_projection` (a single linear layer) and
    the mean-pooled text output goes through `text_projection` (a two-layer MLP); both
    produce vectors of size `config.hidden_size`.
    """

    def __init__(
        self,
        config: Optional[ZhCLIPConfig] = None,
        vision_model: Optional[PreTrainedModel] = None,
        text_model: Optional[PreTrainedModel] = None,
    ):
        """Build the model from a config and/or already constructed encoders.

        Args:
            config: Composite configuration. When omitted, it is derived from the configs
                of `vision_model` and `text_model`, which must then both be given.
            vision_model: Optional ready-made vision encoder; built from
                `config.vision_config` when omitted.
            text_model: Optional ready-made text encoder; built from
                `config.text_config` when omitted.

        Raises:
            ValueError: If neither a config nor both encoders are provided, or if `config`
                is not an instance of `config_class`.
        """
        if config is None and (vision_model is None or text_model is None):
            raise ValueError("Either a configuration or an vision and a text model has to be provided")

        if config is None:
            # ZhCLIPConfig's signature is (text_config, vision_config); pass by keyword so
            # the two sub-configs cannot be swapped.
            config = ZhCLIPConfig(text_config=text_model.config, vision_config=vision_model.config)
        elif not isinstance(config, self.config_class):
            raise ValueError(f"config: {config} has to be of type {self.config_class}")

        # initialize with config
        super().__init__(config)

        if vision_model is None:
            if isinstance(config.vision_config, CLIPVisionConfig):
                # Keep only the inner transformer of the CLIP vision wrapper.
                vision_model = CLIPVisionModel(config.vision_config).vision_model
            else:
                vision_model = AutoModel.from_config(config.vision_config)

        if text_model is None:
            text_model = AutoModel.from_config(config.text_config)

        self.vision_model = vision_model
        self.text_model = text_model

        # make sure that the individual model's config refers to the shared config
        # so that the updates to the config will be synced
        self.vision_model.config = self.config.vision_config
        self.text_model.config = self.config.text_config

        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size
        self.coattention_dim = config.hidden_size

        # add projection layers
        mlp_hidden_size = (self.text_embed_dim + self.coattention_dim) // 2
        self.text_projection = nn.Sequential(
            nn.Linear(self.text_embed_dim, mlp_hidden_size, bias=False),
            nn.GELU(),
            nn.Linear(mlp_hidden_size, self.coattention_dim, bias=False),
        )
        self.text_pooler = MeanPooler()
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.coattention_dim)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        patch_ids = None,
        extend_token_type_ids = None,
        return_loss: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], ZhCLIPModelOutput]:
        """Compute image and text features for one batch.

        `patch_ids`, `extend_token_type_ids` and `return_loss` are accepted but not used
        by this implementation.

        Returns:
            A `ZhCLIPModelOutput` holding `image_features` and `text_features`.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        image_features = self.get_image_features(
            pixel_values=pixel_values,
            return_dict=return_dict,
        )
        text_features = self.get_text_features(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            return_dict=return_dict,
        )
        return ZhCLIPModelOutput(
            image_features=image_features,
            text_features=text_features,
        )

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Load pretrained weights, always forcing `_fast_init=False`."""
        # At the moment fast initialization is not supported
        # for composite models
        kwargs["_fast_init"] = False
        return super().from_pretrained(*args, **kwargs)

    def get_text_features(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        token_type_ids=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Encode text, mean-pool over the attention mask and project.

        `output_attentions` and `output_hidden_states` are accepted but not forwarded to
        the text encoder.

        When `attention_mask` is None, the pooling mask is derived from `input_ids`: the
        pad id comes from the composite config, else from the text config; if neither
        defines one, every position is counted.

        Returns:
            The output of `text_projection` applied to the pooled text encoder output.
        """
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            return_dict=return_dict,
        )
        if attention_mask is None:
            pad_token_id = self.config.pad_token_id
            if pad_token_id is None:
                # The composite config usually leaves pad_token_id unset; the text
                # encoder's own config is the next place that may define it.
                pad_token_id = getattr(self.config.text_config, "pad_token_id", None)
            if pad_token_id is None:
                attention_mask = torch.ones_like(input_ids)
            else:
                attention_mask = (input_ids != pad_token_id).long()
        text_pool = self.text_pooler(text_outputs[0], attention_mask)
        text_feat = self.text_projection(text_pool)
        return text_feat

    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Encode images and project the pooled vision output.

        Flags left as `None` fall back to the values stored on `self.config`.

        Returns:
            image_features (`torch.FloatTensor`): The result of applying `visual_projection`
            to element `[1]` (the pooled output) of the vision encoder's outputs.
        """
        # Use the composite model's config for unspecified flags instead of those of the components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]  # pooled_output
        image_features = self.visual_projection(pooled_output)

        return image_features
zhclip/processing_zhclip.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for ZH-CLIP
17
+ """
18
+
19
+ import warnings
20
+
21
+ from transformers.processing_utils import ProcessorMixin
22
+ from transformers.tokenization_utils_base import BatchEncoding
23
+
24
+
25
class ZhCLIPProcessor(ProcessorMixin):
    r"""
    Constructs a ZH-CLIP processor which wraps an image processor and a tokenizer into a single
    processor.

    Calling the processor tokenizes `text` with the tokenizer and/or prepares `images` with the
    image processor; see [`~ZhCLIPProcessor.__call__`] for details.

    Args:
        image_processor ([`CLIPImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`BertTokenizer`]):
            The tokenizer is a required input.
    """
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = "BertTokenizer"

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """Store the two components, accepting the deprecated `feature_extractor` keyword.

        Raises:
            ValueError: If no image processor (or `feature_extractor`) or no tokenizer is given.
        """
        # Must be bound even when the deprecated kwarg is absent; otherwise the fallback
        # below raises UnboundLocalError instead of the intended ValueError.
        feature_extractor = None
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")

        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """
        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to the tokenizer's `__call__` if `text` is not `None` to encode the text. To prepare
        the image(s), this method forwards the `images` and `kwargs` arguments to the image processor's `__call__` if
        `images` is not `None`. Please refer to the docstring of those two methods for more information.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.

        Raises:
            ValueError: If both `text` and `images` are `None`.
        """
        if text is None and images is None:
            raise ValueError("You have to specify either text or images. Both cannot be none.")

        if text is not None:
            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)

        if images is not None:
            image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)

        if text is not None and images is not None:
            # Merge: the text encoding also carries the pixel values.
            encoding["pixel_values"] = image_features.pixel_values
            return encoding
        elif text is not None:
            return encoding
        else:
            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    @property
    def feature_extractor_class(self):
        """Deprecated alias of `image_processor_class`; emits a `FutureWarning`."""
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        return self.image_processor_class

    @property
    def feature_extractor(self):
        """Deprecated alias of `image_processor`; emits a `FutureWarning`."""
        warnings.warn(
            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
            FutureWarning,
        )
        return self.image_processor