Xenova (HF staff) committed verified commit f2780c4 (1 parent: b48a355)

Update README.md

Files changed (1): README.md (+266, -3)
README.md changed (@@ -1,3 +1,266 @@): the previous revision contained only the YAML front matter declaring `license: apache-2.0`; the updated file is shown in full below.
---
license: apache-2.0
library_name: transformers.js
base_model: Qwen/Qwen2-VL-2B-Instruct
---

https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct with ONNX weights to be compatible with Transformers.js.

## Usage (Transformers.js)

If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [NPM](https://www.npmjs.com/package/@huggingface/transformers) using:
```bash
npm i @huggingface/transformers
```

**Example:** TODO
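
In the meantime, here is a minimal sketch of what usage could look like. It assumes this repository is published under a placeholder id `onnx-community/Qwen2-VL-2B-Instruct` (replace with the actual repo id) and follows the standard Transformers.js v3 generation flow; check the [Transformers.js docs](https://huggingface.co/docs/transformers.js) for the exact processor API.

```js
import {
  AutoProcessor,
  Qwen2VLForConditionalGeneration,
  RawImage,
} from "@huggingface/transformers";

// Placeholder id: replace with the id this repository is published under.
const model_id = "onnx-community/Qwen2-VL-2B-Instruct";
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id);

// Load an image (any reachable URL works) and resize it to bound the number of vision tokens.
const url = "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cats.jpg";
const image = await (await RawImage.read(url)).resize(448, 448);

// Build a chat-style prompt with one image placeholder and a text instruction.
const conversation = [
  {
    role: "user",
    content: [
      { type: "image" },
      { type: "text", text: "Describe this image." },
    ],
  },
];
const text = processor.apply_chat_template(conversation, { add_generation_prompt: true });
const inputs = await processor(text, image);

// Generate, then decode only the newly generated tokens.
const outputs = await model.generate({ ...inputs, max_new_tokens: 128 });
const decoded = processor.batch_decode(
  outputs.slice(null, [inputs.input_ids.dims.at(-1), null]),
  { skip_special_tokens: true },
);
console.log(decoded[0]);
```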

## ONNX conversion script
First, install the following dependencies:
```sh
pip install --upgrade git+https://github.com/huggingface/transformers.git@xenova-patch-2 onnx==1.17.0 onnxruntime==1.20.1 optimum==1.23.3 onnxslim==0.1.42
```

Then run the conversion script below:

```py
import os
import torch
from transformers import (
    AutoProcessor,
    Qwen2VLForConditionalGeneration,
    DynamicCache,
)


# Wrapper whose forward() takes and returns flat tensors (past_key_values.{i}.key/value in,
# present.{i}.key/value out) so torch.onnx.export sees plain tensor I/O instead of a DynamicCache.
class PatchedQwen2VLForConditionalGeneration(Qwen2VLForConditionalGeneration):
    def forward(self, *args):
        inputs_embeds, attention_mask, position_ids, *past_key_values_args = args

        # Convert past_key_values list to DynamicCache
        if len(past_key_values_args) == 0:
            past_key_values = None
        else:
            past_key_values = DynamicCache(self.config.num_hidden_layers)
            for i in range(self.config.num_hidden_layers):
                key = past_key_values_args.pop(0)
                value = past_key_values_args.pop(0)
                past_key_values.update(key_states=key, value_states=value, layer_idx=i)

        o = super().forward(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
        )

        flattened_past_key_values_outputs = {
            "logits": o.logits,
        }
        output_past_key_values: DynamicCache = o.past_key_values
        for i, (key, value) in enumerate(
            zip(output_past_key_values.key_cache, output_past_key_values.value_cache)
        ):
            flattened_past_key_values_outputs[f"present.{i}.key"] = key
            flattened_past_key_values_outputs[f"present.{i}.value"] = value

        return flattened_past_key_values_outputs


# Constants
OUTPUT_FOLDER = "output"
EMBEDDING_MODEL_NAME = "embed_tokens.onnx"
TEXT_MODEL_NAME = "decoder_model_merged.onnx"
VISION_MODEL_NAME = "vision_encoder.onnx"
TEMP_MODEL_OUTPUT_FOLDER = os.path.join(OUTPUT_FOLDER, "temp")
FINAL_MODEL_OUTPUT_FOLDER = os.path.join(OUTPUT_FOLDER, "onnx")


# Load model and processor
model_id = "Qwen/Qwen2-VL-2B-Instruct"
model = PatchedQwen2VLForConditionalGeneration.from_pretrained(model_id).eval()
processor = AutoProcessor.from_pretrained(model_id)


# Save model configs and processor
model.config.save_pretrained(OUTPUT_FOLDER)
model.generation_config.save_pretrained(OUTPUT_FOLDER)
processor.save_pretrained(OUTPUT_FOLDER)
os.makedirs(TEMP_MODEL_OUTPUT_FOLDER, exist_ok=True)


# Configuration values
## Text model
text_config = model.config
num_heads = text_config.num_attention_heads
num_key_value_heads = text_config.num_key_value_heads
head_dim = text_config.hidden_size // num_heads
num_layers = text_config.num_hidden_layers
hidden_size = text_config.hidden_size

## Vision model
vision_config = model.config.vision_config
channel = vision_config.in_chans
temporal_patch_size = vision_config.temporal_patch_size
patch_size = vision_config.spatial_patch_size


# Dummy input sizes
grid_t, grid_h, grid_w = [1, 16, 16]
batch_size = 1
sequence_length = 16
num_channels = 3
past_sequence_length = 0

image_batch_size = 1  # TODO: Add support for > 1 images
assert image_batch_size == 1


# Dummy inputs
## Embedding inputs
input_ids = torch.randint(
    0, model.config.vocab_size, (batch_size, sequence_length), dtype=torch.int64
)

## Text inputs
dummy_past_key_values_kwargs = {
    f"past_key_values.{i}.{key}": torch.zeros(
        batch_size,
        num_key_value_heads,
        past_sequence_length,
        head_dim,
        dtype=torch.float32,
    )
    for i in range(num_layers)
    for key in ["key", "value"]
}
inputs_embeds = torch.ones(
    batch_size, sequence_length, hidden_size, dtype=torch.float32
)
attention_mask = torch.ones(batch_size, sequence_length, dtype=torch.int64)
position_ids = torch.ones(3, batch_size, sequence_length, dtype=torch.int64)

## Vision inputs
grid_thw = torch.tensor(
    [[grid_t, grid_h, grid_w]] * image_batch_size, dtype=torch.int64
)
pixel_values = torch.randn(
    image_batch_size * grid_t * grid_h * grid_w,
    channel * temporal_patch_size * patch_size * patch_size,
    dtype=torch.float32,
)


# ONNX Exports
## Embedding model
embedding_inputs = dict(input_ids=input_ids)
embedding_inputs_positional = tuple(embedding_inputs.values())
model.model.embed_tokens(*embedding_inputs_positional)  # Test forward pass
EMBED_TOKENS_OUTPUT_PATH = os.path.join(TEMP_MODEL_OUTPUT_FOLDER, EMBEDDING_MODEL_NAME)
torch.onnx.export(
    model.model.embed_tokens,
    args=embedding_inputs_positional,
    f=EMBED_TOKENS_OUTPUT_PATH,
    export_params=True,
    opset_version=14,
    do_constant_folding=True,
    input_names=list(embedding_inputs.keys()),
    output_names=["inputs_embeds"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "inputs_embeds": {0: "batch_size", 1: "sequence_length"},
    },
)

## Text model
text_inputs = dict(
    inputs_embeds=inputs_embeds,
    attention_mask=attention_mask,
    position_ids=position_ids,
    **dummy_past_key_values_kwargs,
)
text_inputs_positional = tuple(text_inputs.values())
text_outputs = model.forward(*text_inputs_positional)  # Test forward pass
TEXT_MODEL_OUTPUT_PATH = os.path.join(TEMP_MODEL_OUTPUT_FOLDER, TEXT_MODEL_NAME)
torch.onnx.export(
    model,
    args=text_inputs_positional,
    f=TEXT_MODEL_OUTPUT_PATH,
    export_params=True,
    opset_version=14,
    do_constant_folding=True,
    input_names=list(text_inputs.keys()),
    output_names=["logits"]
    + [f"present.{i}.{key}" for i in range(num_layers) for key in ["key", "value"]],
    dynamic_axes={
        "inputs_embeds": {0: "batch_size", 1: "sequence_length"},
        "attention_mask": {0: "batch_size", 1: "sequence_length"},
        "position_ids": {1: "batch_size", 2: "sequence_length"},
        **{
            f"past_key_values.{i}.{key}": {0: "batch_size", 2: "past_sequence_length"}
            for i in range(num_layers)
            for key in ["key", "value"]
        },
        "logits": {0: "batch_size", 1: "sequence_length"},
        **{
            f"present.{i}.{key}": {0: "batch_size", 2: "past_sequence_length + 1"}
            for i in range(num_layers)
            for key in ["key", "value"]
        },
    },
)

## Vision model
vision_inputs = dict(
    pixel_values=pixel_values,
    grid_thw=grid_thw,
)
vision_inputs_positional = tuple(vision_inputs.values())
vision_outputs = model.visual.forward(*vision_inputs_positional)  # Test forward pass
VISION_ENCODER_OUTPUT_PATH = os.path.join(TEMP_MODEL_OUTPUT_FOLDER, VISION_MODEL_NAME)
torch.onnx.export(
    model.visual,
    args=vision_inputs_positional,
    f=VISION_ENCODER_OUTPUT_PATH,
    export_params=True,
    opset_version=14,
    do_constant_folding=True,
    input_names=list(vision_inputs.keys()),
    output_names=["image_features"],
    dynamic_axes={
        "pixel_values": {
            0: "batch_size * grid_t * grid_h * grid_w",
            1: "channel * temporal_patch_size * patch_size * patch_size",
        },
        "grid_thw": {0: "batch_size"},
        "image_features": {0: "batch_size * grid_t * grid_h * grid_w"},
    },
)


# Post-processing
import onnx
import onnxslim
from optimum.onnx.graph_transformations import check_and_save_model

os.makedirs(FINAL_MODEL_OUTPUT_FOLDER, exist_ok=True)
for name in (EMBEDDING_MODEL_NAME, TEXT_MODEL_NAME, VISION_MODEL_NAME):
    temp_model_path = os.path.join(TEMP_MODEL_OUTPUT_FOLDER, name)

    ## Shape inference (especially needed by the vision encoder)
    onnx.shape_inference.infer_shapes_path(temp_model_path, check_type=True, strict_mode=True)

    ## Attempt to optimize the model with onnxslim
    try:
        onnx_model = onnxslim.slim(temp_model_path)
    except Exception as e:
        print(f"Failed to slim {name}: {e}")
        onnx_model = onnx.load(temp_model_path)

    ## Save model
    final_model_path = os.path.join(FINAL_MODEL_OUTPUT_FOLDER, name)
    check_and_save_model(onnx_model, final_model_path)

## Cleanup
import shutil
shutil.rmtree(TEMP_MODEL_OUTPUT_FOLDER)
```
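
The script leaves the processor and config files in `output/` and the final graphs in `output/onnx/`, which appears to mirror the repo layout Transformers.js expects (configuration at the root, ONNX weights under `onnx/`). As a sketch, the export can be smoke-tested locally before uploading, assuming Node.js and that `output/` has been copied to a hypothetical `./models/Qwen2-VL-2B-Instruct` folder:

```js
import { env, AutoProcessor, Qwen2VLForConditionalGeneration } from "@huggingface/transformers";

// Resolve model ids against a local folder instead of the Hugging Face Hub.
env.localModelPath = "./models/";
env.allowRemoteModels = false;

// "Qwen2-VL-2B-Instruct" now resolves to ./models/Qwen2-VL-2B-Instruct (the copied output/ folder).
const processor = await AutoProcessor.from_pretrained("Qwen2-VL-2B-Instruct");
const model = await Qwen2VLForConditionalGeneration.from_pretrained("Qwen2-VL-2B-Instruct", {
  dtype: "fp32", // load the unsuffixed *.onnx files produced by the script above
});
```

From there, the usage sketch above applies unchanged.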