File size: 2,070 Bytes
312e8ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52

""" Main inference generation for mLlama-3.2-11B compressed and packaged as OV model



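    -- accompanying generator class file: ov_mllama_generator_class.py
       (provides OVMLlamaForConditionalGeneration)

    -- dependencies: transformers and torch; this script also imports
       requests, openvino, Pillow (PIL) and numpy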



"""

import requests
import openvino as ov

from PIL import Image
from transformers import TextStreamer, AutoProcessor
import numpy as np

from ov_mllama_generator_class import OVMLlamaForConditionalGeneration

# --- Configuration -----------------------------------------------------------

# Presumably the Hugging Face repo id of the original model; it is kept for
# reference only and is not read anywhere else in this script.
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# NOTE: machine-specific path to the folder holding the converted model files
# and the processor/tokenizer config -- adjust to your local setup.
model_dir = "C:\\Users\\darre\\llmware_data\\model_repo\\llama-11b-vision-instruct-ov"

core = ov.Core()  # not referenced again in this script

# File names (inside model_dir) of the compressed language model and the
# vision encoder, as passed to the generator class below.
language_model_name = "llm_int4_asym_r10_gs64_max_activation_variance_scale_all_layers.xml"
vision_encoder_name = "openvino_vision_encoder_int8.xml"
device = "CPU"

# Seconds to wait on the sample-image download before giving up; without a
# timeout `requests` can block forever on an unresponsive server.
IMAGE_REQUEST_TIMEOUT = 30

# --- Model and processor -----------------------------------------------------

ov_model = OVMLlamaForConditionalGeneration(
    model_dir,
    device=device,
    language_model_name=language_model_name,
    image_encoder_name=vision_encoder_name,
)

processor = AutoProcessor.from_pretrained(model_dir)

# --- Prompt and image --------------------------------------------------------

question = "What is unusual on this image?"

# Single user turn: one image placeholder followed by the text question.
messages = [
    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": question}]},
]
text = processor.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
url = "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11"

# Download the sample image. The context manager releases the connection,
# raise_for_status() surfaces HTTP errors instead of handing an error page to
# PIL, and load() decodes the pixels while the stream is still open.
with requests.get(url, stream=True, timeout=IMAGE_REQUEST_TIMEOUT) as response:
    response.raise_for_status()
    raw_image = Image.open(response.raw)
    raw_image.load()

# --- Generation --------------------------------------------------------------

inputs = processor(text=text, images=[raw_image], return_tensors="pt")
# Streams decoded tokens to stdout as they are generated, omitting the prompt.
streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
print(f"Question: {question}")

# Greedy decoding (do_sample=False); temperature/top_p are explicitly unset.
output = ov_model.generate(
    **inputs,
    do_sample=False,
    max_new_tokens=100,
    temperature=None,
    top_p=None,
    streamer=streamer,
)

# --- Timing report -----------------------------------------------------------
# The timing attributes are indexed as sequences of seconds (scaled to ms
# here); their exact semantics live in the generator class -- TODO confirm.

print(f"Visual encoder time {ov_model.vision_encoder_infer_time[0] * 1000 :.2f} ms")

first_token_ms = ov_model.llm_infer_time[0] * 1000
later_token_times = ov_model.llm_infer_time[1:]
if len(later_token_times) > 0:
    # The "Second token latency" figure is the mean over all entries after the first.
    later_token_ms = np.mean(np.array(later_token_times)) * 1000
    print(f"First token latency {first_token_ms:.2f}ms, Second token latency {later_token_ms:.2f}ms")
else:
    # Only one timing entry was recorded: averaging an empty slice would
    # print "nan" and emit a RuntimeWarning, so report the first token only.
    print(f"First token latency {first_token_ms:.2f}ms")