doberst committed on
Commit
312e8ff
1 Parent(s): b0f4867

Upload 2 files

ov_mllama_generator_class.py ADDED
@@ -0,0 +1,518 @@
""" Core wrapper patching class on mllama-11b OV - excludes all conversion components - and is only for inference.

-- Generation loop flows through GenerationMixin - will need to remove torch + transformers
"""

from pathlib import Path
from transformers import AutoConfig, GenerationConfig

from typing import Optional, Union, List, Tuple, Dict
from transformers.generation import GenerationMixin
from transformers.modeling_outputs import ModelOutput
import openvino.runtime.opset13 as ops
import openvino as ov
import torch
import numpy as np
from dataclasses import dataclass
from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher
import time

core = ov.Core()

LANGUAGE_MODEL = "llm_int4_asym_r10_gs64_max_activation_variance_scale_all_layers.xml"
IMAGE_ENCODER = "openvino_vision_encoder_int8.xml"


@dataclass
class MLlamaOutputWithPast(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    cross_attn_key_values: Optional[List[torch.FloatTensor]] = None


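# Note (added for clarity): InsertSlice is an OpenVINO graph-transformation pass. It finds the
# 3D Result node produced by the LM head and inserts a Slice so that only the last token's
# logits are returned, which avoids materializing full-sequence logits during prefill.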
36
class InsertSlice(MatcherPass):
    def __init__(self):
        MatcherPass.__init__(self)
        self.model_changed = False

        param = WrapType("opset10.Result")

        def callback(matcher: Matcher) -> bool:
            root = matcher.get_match_root()
            if root is None:
                return False
            if len(root.get_output_partial_shape(0)) == 3:
                parent = root.input_value(0).get_node()
                grand_parent = parent.input_value(0).get_node()

                grand_parent_output = parent.input(0).get_source_output()
                consumers = grand_parent_output.get_target_inputs()
                start = np.array([0, -1, 0], dtype=np.int32)
                stop = np.array([1, -2, grand_parent_output.get_partial_shape()[-1].get_length()], dtype=np.int32)
                step = np.array([1, -1, 1], dtype=np.int32)
                axes = np.array([0, 1, 2], dtype=np.int32)
                slice = ops.slice(grand_parent, start, stop, step, axes, name="inserted_slice")
                for consumer in consumers:
                    consumer.replace_source_output(slice.output(0))
                self.model_changed = True
                # Use new operation for additional matching
                self.register_new_node(slice)
                print("applied slice for lm head")

            return True

        self.register_matcher(Matcher(param, "InsertSlice"), callback)


STR_TO_OV_TYPE = {
    "boolean": ov.Type.boolean,
    "f16": ov.Type.f16,
    "f32": ov.Type.f32,
    "f64": ov.Type.f64,
    "i8": ov.Type.i8,
    "i16": ov.Type.i16,
    "i32": ov.Type.i32,
    "i64": ov.Type.i64,
    "u8": ov.Type.u8,
    "u16": ov.Type.u16,
    "u32": ov.Type.u32,
    "u64": ov.Type.u64,
    "bf16": ov.Type.bf16,
}


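# Note (added for clarity): this wrapper drives two OpenVINO models. The vision encoder is run
# once per image and returns precomputed cross-attention key/values ("cross_attn_key_values");
# the language model is a stateful OV model, so the text KV cache lives inside the infer request
# and `past_key_values` only acts as a prefill/decode flag on the Python side.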
87
class OVMLlamaForConditionalGeneration(GenerationMixin):
    def __init__(
        self,
        model_dir: Union[str, Path],
        device: str = "CPU",
        ov_config: Optional[Dict[str, str]] = None,
        language_model_name=None,
        image_encoder_name=None,
        slice_lm_head=True,
        use_remote_tensors=True,
        dynamic_shape=False,
    ):
        model_dir = Path(model_dir)
        self.config = AutoConfig.from_pretrained(model_dir)
        self.generation_config = GenerationConfig.from_pretrained(model_dir)
        self.main_input_name = "input_ids"
        self.device = torch.device("cpu")
        self._device = device
        self.ov_config = ov_config
        self.num_pkv = 2
        self._supports_cache_class = False
        self.next_beam_idx = None
        self._past_length = None
        if language_model_name:
            self.model = core.read_model(model_dir / language_model_name)
        else:
            self.model = core.read_model(model_dir / LANGUAGE_MODEL)
        if image_encoder_name:
            self.vision_model = core.read_model(model_dir / image_encoder_name)
        else:
            self.vision_model = core.read_model(model_dir / IMAGE_ENCODER)
        if not dynamic_shape:
            self.reshape_vision_model()
        self.update_pkv_precision()
        if slice_lm_head:
            self.slice_lm_head()
        self.input_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.inputs)}
        self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)}
        self.lm_cross_attn_inputs = [key for key in self.input_names if "cross_attn_key_values" in key]
        compiled_model = core.compile_model(self.model, device, ov_config)
        self.request = compiled_model.create_infer_request()
        self.cross_attn_outputs = [key.get_any_name() for key in self.vision_model.outputs if "cross_attn_key_values" in key.get_any_name()]
        compiled_vision_model = core.compile_model(self.vision_model, device, ov_config)
        self.vision_request = compiled_vision_model.create_infer_request()
        self.use_remote_tensors = use_remote_tensors and self._device == "GPU"
        if self.use_remote_tensors:
            self.prepare_remote_tensors()
        self.next_beam_idx = None
        self.num_patches = (self.config.vision_config.image_size // self.config.vision_config.patch_size) ** 2 + 1
        self._past_length = 0
        self.llm_infer_time = []
        self.vision_encoder_infer_time = []

    def _get_past_length(self, past_key_values=None):
        if past_key_values is None:
            return 0
        return self._past_length

    def reshape_vision_model(self):
        self.vision_model.reshape(
            {
                0: ov.PartialShape([1, 1, 4, 3, self.config.vision_config.image_size, self.config.vision_config.image_size]),
                1: ov.PartialShape([1, 1]),
                2: ov.PartialShape([1, 1, 4]),
            }
        )

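    # Note (added for clarity): align the element type of the cross-attention KV inputs/outputs
    # with the device's inference precision hint (e.g. f16 on GPU), so the vision encoder's outputs
    # can be fed to the language model without an extra conversion step.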
154
    def update_pkv_precision(self, force_fp32=False):
        pkv_precision = ov.Type.f32
        if not force_fp32:
            device = self._device.upper()
            try:
                if "INFERENCE_PRECISION_HINT" in core.get_property(device, "SUPPORTED_PROPERTIES"):
                    pkv_precision = core.get_property(device, "INFERENCE_PRECISION_HINT")
            except RuntimeError:  # use default precision when get_property fails, e.g. when device is "AUTO:GPU"
                pass

            # ov_config["INFERENCE_PRECISION_HINT"] may override the preferred precision
            if self.ov_config:
                inference_precision_hint = self.ov_config.get("INFERENCE_PRECISION_HINT", "")
                if inference_precision_hint in STR_TO_OV_TYPE:
                    pkv_precision = STR_TO_OV_TYPE[inference_precision_hint]

        ppp = ov.preprocess.PrePostProcessor(self.model)
        for key in self.model.inputs:
            if "cross_attn_key_values" in key.get_any_name() and pkv_precision != key.get_element_type():
                ppp.input(key.get_any_name()).tensor().set_element_type(pkv_precision)

        self.model = ppp.build()

        ppp_v = ov.preprocess.PrePostProcessor(self.vision_model)
        for key in self.vision_model.outputs:
            if "cross_attn_key_values" in key.get_any_name() and pkv_precision != key.get_element_type():
                ppp_v.output(key.get_any_name()).tensor().set_element_type(pkv_precision)
        self.vision_model = ppp_v.build()
        self._pkv_precision = pkv_precision

    def slice_lm_head(self):
        manager = Manager()
        manager.register_pass(InsertSlice())
        manager.run_passes(self.model)
        self.model.validate_nodes_and_infer_types()

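    # Note (added for clarity): forward() mirrors the HF MllamaForConditionalGeneration signature so that
    # GenerationMixin.generate() can drive it. On the prefill step pixel_values are present, the vision
    # encoder is invoked once, and the resulting cross-attention KV tensors are reused on every decode step.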
190
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        aspect_ratio_mask: Optional[List[List[int]]] = None,
        aspect_ratio_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[List[List[List[int]]]] = None,
        cross_attention_mask: Optional[torch.Tensor] = None,
        cross_attention_states: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        cross_attn_key_values: Optional[List[torch.Tensor]] = None,
        num_logits_to_keep: int = 0,
    ) -> Union[Tuple, MLlamaOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            num_logits_to_keep (`int`, *optional*):
                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
        """

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one")

        if pixel_values is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one")

        if pixel_values is not None and cross_attention_states is not None:
            raise ValueError("`pixel_values` and `cross_attention_states` cannot be provided simultaneously")

        if pixel_values is not None:
            if aspect_ratio_ids is None:
                raise ValueError("`aspect_ratio_ids` must be provided if `pixel_values` is provided")
            # get vision tokens from vision model
            cross_attn_key_values = self.visual_encoder(pixel_values, aspect_ratio_ids, aspect_ratio_mask)
        cross_attention_mask, full_text_row_masked_out_mask = self._prepare_cross_attention_mask(
            cross_attention_mask,
            past_key_values=past_key_values,
            num_vision_tokens=self.num_patches,
            cross_attention_layers=cross_attn_key_values if past_key_values is not None else None,
            cross_attention_states=((),),
            device=self.device,
            dtype=torch.float32,
        )

        if cross_attention_mask is not None and cache_position is not None:
            cross_attention_mask = cross_attention_mask[:, :, cache_position]
            full_text_row_masked_out_mask = full_text_row_masked_out_mask[:, :, cache_position]

        return self.language_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            cross_attention_mask=cross_attention_mask,
            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
            past_key_values=past_key_values,
            cache_position=cache_position,
            cross_attention_key_values=cross_attn_key_values,
        )

    def language_model(
        self,
        input_ids,
        attention_mask,
        position_ids,
        cross_attention_mask,
        full_text_row_masked_out_mask,
        past_key_values,
        cache_position,
        cross_attention_key_values,
    ):
        model_inputs = {
            "input_ids": ov.Tensor(np.array(input_ids)),
            "attention_mask": ov.Tensor(np.array(attention_mask)),
            "position_ids": ov.Tensor(np.array(position_ids)),
            "cross_attention_mask": ov.Tensor(np.array(cross_attention_mask)),
            "full_text_row_masked_out_mask": ov.Tensor(np.array(full_text_row_masked_out_mask)),
            "cache_position": ov.Tensor(np.array(cache_position)),
        }

        if past_key_values is None:
            self.request.reset_state()
            self.next_beam_idx = np.arange(input_ids.shape[0], dtype=int)
            self._past_length = 0
            self.llm_infer_time = []

        if not self.use_remote_tensors:
            model_inputs.update(dict(zip(self.lm_cross_attn_inputs, cross_attention_key_values)))
        if "beam_idx" in self.input_names:
            model_inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(input_ids.shape[0], dtype=int)

        start = time.perf_counter()
        self.request.start_async(model_inputs, share_inputs=True)
        self.request.wait()
        end = time.perf_counter()
        self.llm_infer_time.append(end - start)
        logits = torch.from_numpy(self.request.get_tensor("logits").data)
        past_key_values = ((),)
        self._past_length += input_ids.shape[1]
        out = MLlamaOutputWithPast(logits=logits, past_key_values=past_key_values, cross_attn_key_values=cross_attention_key_values)
        return out

    def can_generate(self):
        """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate."""
        return True

    def __call__(self, *args, **kwargs) -> MLlamaOutputWithPast:
        return self.forward(
            *args,
            **kwargs,
        )

    def _reorder_cache(self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]:
        """
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called.
        This is required to match `past_key_values` with the correct beam_idx at every generation step.
        """
        self.next_beam_idx = np.array(beam_idx)  # save beam_idx to be used as an input in the next iteration
        return past_key_values

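    # Note (added for clarity): prepare_inputs_for_generation follows the standard transformers contract:
    # slice input_ids down to the unprocessed tokens, build position_ids from the attention mask, and only
    # pass pixel_values/aspect ratios when the prompt still contains image placeholder tokens.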
326
    def prepare_inputs_for_generation(
        self,
        input_ids=None,
        inputs_embeds=None,
        attention_mask=None,
        position_ids=None,
        pixel_values=None,
        aspect_ratio_ids=None,
        aspect_ratio_mask=None,
        cross_attention_mask=None,
        past_key_values=None,
        use_cache=False,
        cache_position=None,
        cross_attn_key_values=None,
        num_logits_to_keep=None,
        **kwargs,
    ):
        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if past_key_values is not None:
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # The clone here is for the same reason as for `position_ids`.
        model_inputs = {"input_ids": input_ids, "inputs_embeds": None}

        if num_logits_to_keep is not None:
            model_inputs["num_logits_to_keep"] = num_logits_to_keep

        model_inputs.update(
            {
                "position_ids": position_ids,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "cross_attention_mask": cross_attention_mask,
                "cross_attn_key_values": cross_attn_key_values,
            }
        )

        # If we're in the pre-fill or cacheless decoding step, then we need pixel_values and aspect ratios
        # to compute image hidden states, otherwise they are cached within each cross attn layer
        if (input_ids == self.config.image_token_index).any():
            model_inputs["pixel_values"] = pixel_values
            model_inputs["aspect_ratio_ids"] = aspect_ratio_ids
            model_inputs["aspect_ratio_mask"] = aspect_ratio_mask

        return model_inputs

    def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder, **kwargs):
        cross_attention_mask_prev = model_kwargs.get("cross_attention_mask", None)
        model_kwargs = super()._update_model_kwargs_for_generation(
            outputs=outputs,
            model_kwargs=model_kwargs,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )

        # add cross-attn mask for new token
        if cross_attention_mask_prev is not None:
            model_kwargs["cross_attention_mask"] = torch.cat([cross_attention_mask_prev, cross_attention_mask_prev[:, -1:, ...]], dim=1)
        model_kwargs["cross_attn_key_values"] = outputs.cross_attn_key_values
        return model_kwargs

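    # Note (added for clarity): the cross-attention mask from the HF processor has shape
    # [batch, text_len, num_images, num_tiles]; it is expanded to one column per vision token,
    # flattened, and converted to an additive mask (0 for visible positions, -inf for masked ones).
    # The returned full_text_row_masked_out_mask flags text rows that attend to no vision token at all.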
401
    def _prepare_cross_attention_mask(
        self,
        cross_attention_mask: torch.Tensor,
        past_key_values: Tuple,
        num_vision_tokens: int,
        cross_attention_states: torch.Tensor,
        cross_attention_layers: List[int],
        device: str,
        dtype: str,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if cross_attention_mask is None:
            # should we raise an error or prepare a full attn mask with all ones?
            return None, None
        else:
            # reshape so it can be used by attn module
            batch_size, text_total_length, *_ = cross_attention_mask.shape
            cross_attention_mask = cross_attention_mask.repeat_interleave(num_vision_tokens, dim=3)
            cross_attention_mask = cross_attention_mask.view(batch_size, text_total_length, -1)
            cross_attention_mask = cross_attention_mask.unsqueeze(1)

        # invert the mask
        inverted_cross_attn_mask = (1.0 - cross_attention_mask).to(dtype)
        cross_attention_mask = inverted_cross_attn_mask.masked_fill(inverted_cross_attn_mask.to(torch.bool), torch.finfo(dtype).min)

        # apply full-row bias, which returns a 4D tensor of shape [B, H, S1, 1] whose value is 0 if a full row in the
        # cross attn mask's last dimension contains negative infinity values, otherwise it's 1
        negative_inf_value = torch.finfo(dtype).min
        full_text_row_masked_out_mask = (cross_attention_mask != negative_inf_value).any(dim=-1).type_as(cross_attention_mask)[..., None]
        cross_attention_mask *= full_text_row_masked_out_mask

        # In case we receive a new image but already have previous cross-attention key/values in cache,
        # then we need to extend the attention-mask and add previous images' lengths
        if past_key_values is not None and cross_attention_states is not None and cross_attention_layers is not None:
            # make an all-zeros mask for the cross-attn mask over previously cached hidden_states,
            # i.e. extend the current cross-attn mask on the image-seq-length dimension to account for past_seen_tokens
            past_cross_attn_kv_length = cross_attention_layers[0].shape[-2]
            past_cross_attn_mask = torch.zeros((*cross_attention_mask.shape[:-1], past_cross_attn_kv_length), dtype=dtype, device=device)
            # concatenate both on image-seq-length dimension
            cross_attention_mask = torch.cat([past_cross_attn_mask, cross_attention_mask], dim=-1)

        return cross_attention_mask, full_text_row_masked_out_mask

    def visual_encoder(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask):
        if pixel_values is not None:
            if aspect_ratio_ids is None:
                raise ValueError("`aspect_ratio_ids` must be provided if `pixel_values` is provided")
            self.vision_encoder_infer_time = []
            start = time.perf_counter()
            # get vision tokens from vision model
            self.vision_request.start_async([pixel_values, aspect_ratio_ids, aspect_ratio_mask], share_inputs=True)
            self.vision_request.wait()
            end = time.perf_counter()
            cross_attn_key_values = [self.vision_request.get_tensor(name) for name in self.cross_attn_outputs]
            self.vision_encoder_infer_time.append(end - start)
            return cross_attn_key_values

    def prepare_vision_outputs(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask, cross_attention_mask=None, past_key_values=None, cache_position=None):
        cross_attn_key_values = self.visual_encoder(pixel_values, aspect_ratio_ids, aspect_ratio_mask)
        cross_attn_key_values = [v.data for v in cross_attn_key_values]
        cross_attention_mask, full_text_row_masked_out_mask = self._prepare_cross_attention_mask(
            cross_attention_mask,
            past_key_values=past_key_values,
            num_vision_tokens=self.num_patches,
            cross_attention_layers=cross_attn_key_values if past_key_values is not None else None,
            cross_attention_states=1,
            device=self.device,
            dtype=torch.float32,
        )

        if cross_attention_mask is not None and cache_position is not None:
            cross_attention_mask = cross_attention_mask[:, :, cache_position]
            full_text_row_masked_out_mask = full_text_row_masked_out_mask[:, :, cache_position]

        return {
            "cross_attention_mask": cross_attention_mask,
            "full_text_row_masked_out_mask": full_text_row_masked_out_mask,
            "past_key_values": past_key_values,
            "cache_position": cache_position,
            "cross_attention_key_values": cross_attn_key_values,
        }

    def prepare_llm_inputs(
        self,
        input_ids,
        attention_mask,
        position_ids,
        cross_attention_mask,
        full_text_row_masked_out_mask,
        past_key_values,
        cache_position,
        cross_attention_key_values,
    ):
        model_inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
            "cross_attention_mask": cross_attention_mask,
            "full_text_row_masked_out_mask": full_text_row_masked_out_mask,
            "cache_position": cache_position,
        }

        if past_key_values is None:
            self.request.reset_state()
            self.next_beam_idx = np.arange(input_ids.shape[0], dtype=int)
            self._past_length = 0

        model_inputs.update(dict(zip(self.lm_cross_attn_inputs, cross_attention_key_values)))
        if "beam_idx" in self.input_names:
            model_inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(input_ids.shape[0], dtype=int)

        return model_inputs

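    # Note (added for clarity): on GPU, remote tensors let the vision encoder's cross-attention KV outputs
    # and the language model's corresponding inputs share the same device memory, avoiding host round-trips.
    # The tensor shape below is hard-coded for this model's static 4-tile vision input (6404 appears to
    # correspond to 4 tiles x 1601 vision tokens); a different configuration would need a different shape.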
513
    def prepare_remote_tensors(self):
        context = core.get_default_context("GPU")
        for idx, name in enumerate(self.lm_cross_attn_inputs):
            remote_tensor = context.create_tensor(ov.Type.f16, ov.Shape([1, 32, 6404, 128]), {})
            self.vision_request.set_tensor(self.cross_attn_outputs[idx], remote_tensor)
            self.request.set_tensor(name, remote_tensor)
ov_mllama_generator_script.py ADDED
@@ -0,0 +1,51 @@
""" Main inference generation for mLlama-3.2-11B compressed and packaged as OV model

-- accompanying generator_class file - ov_mllama_generator_class.py

-- dependencies: transformers and torch

"""

import requests
import openvino as ov

from PIL import Image
from transformers import TextStreamer, AutoProcessor
import numpy as np

from ov_mllama_generator_class import OVMLlamaForConditionalGeneration

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_dir = "C:\\Users\\darre\\llmware_data\\model_repo\\llama-11b-vision-instruct-ov"

core = ov.Core()

language_model_name = "llm_int4_asym_r10_gs64_max_activation_variance_scale_all_layers.xml"
vision_encoder_name = "openvino_vision_encoder_int8.xml"
device = "CPU"

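# Note (added for clarity): device can also be set to "GPU"; the wrapper class then keeps the
# vision-encoder outputs and the language-model cross-attention inputs in shared remote tensors.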
28
ov_model = OVMLlamaForConditionalGeneration(model_dir, device=device,
                                            language_model_name=language_model_name,
                                            image_encoder_name=vision_encoder_name)

processor = AutoProcessor.from_pretrained(model_dir)

question = "What is unusual on this image?"

messages = [
    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]},
]
text = processor.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11"
raw_image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=text, images=[raw_image], return_tensors="pt")
streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
print(f"Question: {question}")

output = ov_model.generate(**inputs, do_sample=False, max_new_tokens=100, temperature=None, top_p=None, streamer=streamer)
print(f"Visual encoder time {ov_model.vision_encoder_infer_time[0] * 1000 :.2f} ms")
print(f"First token latency {ov_model.llm_infer_time[0] * 1000 :.2f} ms, second token latency {np.mean(np.array(ov_model.llm_infer_time[1:])) * 1000:.2f} ms")
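
# Added sketch (not part of the original script): `generate()` returns the full sequence,
# prompt tokens included, so the answer can also be recovered after streaming by decoding
# only the newly generated ids.
new_tokens = output[0][inputs["input_ids"].shape[1]:]
print(f"Answer: {processor.tokenizer.decode(new_tokens, skip_special_tokens=True)}")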