Image-Text-to-Text
Transformers
Safetensors
Portuguese
tinyllava
text-generation
vision
custom_code
nicholasKluge committed
Commit 9e57926 · verified · 1 Parent(s): 46926cc

Upload modeling_tinyllava_tucano.py with huggingface_hub

Files changed (1)
  1. modeling_tinyllava_tucano.py +627 -0
modeling_tinyllava_tucano.py ADDED
@@ -0,0 +1,627 @@
# This file is a modified version of:
#- [modeling_tinyllava_phi.py](https://huggingface.co/tinyllava/TinyLLaVA-Phi-2-SigLIP-3.1B/blob/main/modeling_tinyllava_phi.py)
# Made by TinyLLaVA for the tinyllava/TinyLLaVA-Phi-2-SigLIP-3.1B.
import time

import dataclasses
from enum import auto, Enum
from typing import List, Tuple, Optional, Union
import requests
from PIL import Image
from io import BytesIO
import base64
import re

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import functional as F

from transformers.utils import logging
from transformers import PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.generation.utils import GenerateOutput
from transformers import CLIPVisionModel, CLIPImageProcessor, SiglipVisionModel, SiglipImageProcessor

from .configuration import TinyLlavaConfig, IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN

from transformers import AutoConfig, AutoModelForCausalLM, LlamaForCausalLM



logger = logging.get_logger(__name__)

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15
LOGDIR = "."


class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    MPT = auto()
    PLAIN = auto()
    LLAMA_2 = auto()
    TINY_LLAMA = auto()
    QWEN_2 = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history."""
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False

    def get_prompt(self):
        messages = self.messages
        if len(messages) > 0 and type(messages[0][1]) is tuple:
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()
            if 'mmtag' in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        self.messages.append([role, message])

    def copy(self):
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version)


conv_tucano_v0 = Conversation(
    system="Um bate-papo entre um usuário curioso e um assistente de inteligência artificial. "
           "O assistente dá respostas úteis, detalhadas e educadas às perguntas do usuário.",
    roles=("\n Usuário", "\n Assistente"),
    version="llama",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)
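# Illustrative note, not part of the original upload: with SeparatorStyle.TWO,
# sep=" " and sep2="</s>", get_prompt() renders a single-turn exchange roughly as
#     "<system text> \n Usuário: <image>\n<user prompt> \n Assistente:"
# i.e. the system text, the closed user turn, and an open assistant turn that the
# language model is expected to complete.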


def load_image_from_base64(image):
    return Image.open(BytesIO(base64.b64decode(image)))


def expand2square(pil_img, background_color):
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width - height) // 2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, ((height - width) // 2, 0))
        return result


def process_images(images, image_processor, model_cfg):
    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
    new_images = []
    if image_aspect_ratio == 'pad':
        for image in images:
            image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
            image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
            new_images.append(image)
    else:
        return image_processor(images, return_tensors='pt')['pixel_values']
    if all(x.shape == new_images[0].shape for x in new_images):
        new_images = torch.stack(new_images, dim=0)
    return new_images


def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]

    def insert_separator(X, sep):
        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
        input_ids.extend(x[offset:])

    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
    return input_ids
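# Illustrative note, not part of the original upload: tokenizer_image_token() tokenizes
# the text on each side of every "<image>" marker separately, keeps a single leading BOS
# token, and splices IMAGE_TOKEN_INDEX (-200) in place of each marker, e.g. roughly
# [bos] + ids("A ") + [-200] + ids(" B") for the prompt "A <image> B". The -200
# placeholders are later swapped for projected vision features in
# prepare_inputs_labels_for_multimodal().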

def load_image(image_file):
    if image_file.startswith("http") or image_file.startswith("https"):
        response = requests.get(image_file)
        image = Image.open(BytesIO(response.content)).convert("RGB")
    else:
        image = Image.open(image_file).convert("RGB")
    return image

ACT_TYPE = {
    'relu': nn.ReLU,
    'gelu': nn.GELU
}

class Connector(nn.Module):
    def __init__(self, config=None):
        super().__init__()
        mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', config.connector_type)
        act_type = config.connector_type.split('_')[-1]
        mlp_depth = int(mlp_gelu_match.group(1))
        modules = [nn.Linear(config.vision_hidden_size, config.hidden_size)]
        for _ in range(1, mlp_depth):
            modules.append(ACT_TYPE[act_type]())
            modules.append(nn.Linear(config.hidden_size, config.hidden_size))

        self._connector = nn.Sequential(*modules)

    def forward(self, x):
        return self._connector(x)

class VisionTower(nn.Module):
    def __init__(self, cfg, model_name_or_path = 'clip'):
        super().__init__()
        if 'clip' in model_name_or_path:
            self._vision_tower = CLIPVisionModel(cfg)
            self._image_processor = CLIPImageProcessor.from_pretrained(cfg.model_name_or_path)
        else:
            self._vision_tower = SiglipVisionModel(cfg)
            self._image_processor = SiglipImageProcessor.from_pretrained(cfg.model_name_or_path)

        self.config = cfg

    def forward(self, x, **kwargs):
        image_features = self._vision_tower(x, output_hidden_states=True)
        image_features = image_features.hidden_states[kwargs.get('vision_feature_layer', -2)]

        if kwargs.get('vision_feature_select_strategy', 'patch') == 'patch':
            image_features = image_features[:, 1:]
        elif kwargs.get('vision_feature_select_strategy', 'patch') == 'cls_patch':
            image_features = image_features
        else:
            raise ValueError(f"Unexpected select feature: {kwargs.get('vision_feature_select_strategy')}")

        return image_features

    @property
    def vision_tower(self):
        return self._vision_tower

    @vision_tower.setter
    def vision_tower(self, vision_tower):
        self._vision_tower = vision_tower

def get_value_from_kwargs(kwargs, name):
    if name in kwargs:
        return kwargs.pop(name)
    else:
        return None


class TinyLlavaPreTrainedModel(PreTrainedModel):
    config_class = TinyLlavaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LlavaVisionAttention"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True

    def _init_weights(self, module):
        std = (
            self.config.initializer_range
            if hasattr(self.config, "initializer_range")
            else self.config.text_config.initializer_range
        )

        if hasattr(module, "class_embedding"):
            module.class_embedding.data.normal_(mean=0.0, std=std)

        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def _supports_sdpa(self):
        return self.language_model._supports_sdpa


class TinyLlavaForConditionalGeneration(TinyLlavaPreTrainedModel):
    def __init__(self, config: TinyLlavaConfig):

        super().__init__(config)

        self.language_model = LlamaForCausalLM(config.text_config)
        self.vision_tower = VisionTower(config.vision_config, config.vision_model_name_or_path)
        self.connector = Connector(config)
        self.post_init()


    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def tie_weights(self):
        return self.language_model.tie_weights()

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        # update vocab size
        self.config.text_config.vocab_size = model_embeds.num_embeddings
        self.config.vocab_size = model_embeds.num_embeddings
        self.vocab_size = model_embeds.num_embeddings
        return model_embeds


    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[List[List[int]]] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        if inputs_embeds is None:
            (
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                labels,
                images,
                image_sizes
            )
        return self.language_model.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            (
                inputs,
                position_ids,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                inputs,
                position_ids,
                attention_mask,
                None,
                None,
                images,
                image_sizes=image_sizes
            )
        else:
            inputs_embeds = self.language_model.get_input_embeddings()(inputs)

        return self.language_model.generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def encode_images(self, images):
        kwargs = {}
        kwargs['vision_feature_layer'] = self.config.vision_feature_layer
        kwargs['vision_feature_select_strategy'] = self.config.vision_feature_select_strategy
        images = images.to(device=self.device, dtype=self.dtype)
        image_features = self.vision_tower(images, **kwargs)
        image_features = self.connector(image_features)
        return image_features



    def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                                      inputs_embeds=None, **kwargs):
        images = kwargs.pop("images", None)
        image_sizes = kwargs.pop("image_sizes", None)
        inputs = self.language_model.prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            inputs['images'] = images
        if image_sizes is not None:
            inputs['image_sizes'] = image_sizes
        return inputs

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, position_ids, attention_mask, past_key_values, labels,
        images, image_sizes=None
    ):
        vision_tower = self.vision_tower
        if vision_tower is None or images is None or input_ids.shape[1] == 1:
            return input_ids, position_ids, attention_mask, past_key_values, None, labels


        image_features = self.encode_images(images)

        # TODO: image start / end is not implemented here to support pretraining.
        if getattr(self.config, 'tune_mm_mlp_adapter', False):
            raise NotImplementedError

        # Let's just add dummy tensors if they do not exist,
        # it is a headache to deal with None all the time.
        # But it is not ideal, and if you have a better idea,
        # please open an issue / submit a PR, thanks.
        _labels = labels
        _position_ids = position_ids
        _attention_mask = attention_mask
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
        else:
            attention_mask = attention_mask.bool()
        if position_ids is None:
            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
        if labels is None:
            labels = torch.full_like(input_ids, IGNORE_INDEX)

        # remove the padding using attention_mask -- FIXME
        _input_ids = input_ids
        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]

        new_input_embeds = []
        new_labels = []
        cur_image_idx = 0
        for batch_idx, cur_input_ids in enumerate(input_ids):
            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
            if num_images == 0:
                cur_image_features = image_features[cur_image_idx]
                cur_input_embeds_1 = self.language_model.get_input_embeddings()(cur_input_ids)
                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
                new_input_embeds.append(cur_input_embeds)
                new_labels.append(labels[batch_idx])
                cur_image_idx += 1
                continue

            image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
            cur_input_ids_noim = []
            cur_labels = labels[batch_idx]
            cur_labels_noim = []
            for i in range(len(image_token_indices) - 1):
                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
                cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
            split_sizes = [x.shape[0] for x in cur_labels_noim]
            cur_input_embeds = self.language_model.get_input_embeddings()(torch.cat(cur_input_ids_noim))
            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
            cur_new_input_embeds = []
            cur_new_labels = []

            for i in range(num_images + 1):
                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
                cur_new_labels.append(cur_labels_noim[i])
                if i < num_images:
                    cur_image_features = image_features[cur_image_idx]
                    cur_image_idx += 1
                    cur_new_input_embeds.append(cur_image_features)
                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))

            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]

            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
            cur_new_labels = torch.cat(cur_new_labels)

            new_input_embeds.append(cur_new_input_embeds)
            new_labels.append(cur_new_labels)

        # Truncate sequences to max length as image embeddings can make the sequence longer
        tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
        if tokenizer_model_max_length is not None:
            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]

        # Combine them
        max_len = max(x.shape[0] for x in new_input_embeds)
        batch_size = len(new_input_embeds)

        new_input_embeds_padded = []
        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)

        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
            cur_len = cur_new_embed.shape[0]
            if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
                new_input_embeds_padded.append(torch.cat((
                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
                    cur_new_embed
                ), dim=0))
                if cur_len > 0:
                    new_labels_padded[i, -cur_len:] = cur_new_labels
                    attention_mask[i, -cur_len:] = True
                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
            else:
                new_input_embeds_padded.append(torch.cat((
                    cur_new_embed,
                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
                ), dim=0))
                if cur_len > 0:
                    new_labels_padded[i, :cur_len] = cur_new_labels
                    attention_mask[i, :cur_len] = True
                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)

        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)

        if _labels is None:
            new_labels = None
        else:
            new_labels = new_labels_padded

        if _attention_mask is None:
            attention_mask = None
        else:
            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)

        if _position_ids is None:
            position_ids = None

        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels

    def chat(
        self,
        prompt: str,
        tokenizer = None,
        image: str = None,
        max_new_tokens: int = 512,
        num_beams = 1,
        top_p=None,
        temperature=0
    ):
        image_processor = self.vision_tower._image_processor

        if image is not None:
            prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt
        else:
            raise ValueError("Image is required for chat")

        # Format conversation
        conv = conv_tucano_v0.copy()
        conv.append_message(conv.roles[0], prompt)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        # Get image tensor
        image = load_image(image)
        image_tensor = process_images(image, image_processor, self.config).to(self.device)

        input_ids = (
            tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
            .unsqueeze(0).to(self.device)
        )
        # Generate
        stime = time.time()

        with torch.inference_mode():
            output_ids = self.generate(
                input_ids,
                images=image_tensor,
                do_sample=True if temperature > 0 else False,
                temperature=temperature,
                top_p=top_p,
                num_beams=num_beams,
                pad_token_id=tokenizer.pad_token_id,
                max_new_tokens=max_new_tokens,
                use_cache=True,
                # stopping_criteria=[stopping_criteria],
            )

        generation_time = time.time() - stime
        outputs = tokenizer.batch_decode(
            output_ids, skip_special_tokens=True
        )[0]

        outputs = outputs.strip()

        return outputs, generation_time


AutoConfig.register("tinyllava", TinyLlavaConfig)
AutoModelForCausalLM.register(TinyLlavaConfig, TinyLlavaForConditionalGeneration)
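For reference, a minimal usage sketch of the interface this file exposes. It assumes the file ships as Hub custom code with the "tinyllava" model type resolving to the class above (hence trust_remote_code=True); the repository id and image path below are placeholders, not values taken from this commit.

from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-namespace/your-tinyllava-tucano-repo"  # placeholder, not a real repository id
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# chat() prepends the <image> token, builds the Tucano conversation prompt,
# runs generate(), and returns the decoded answer plus the generation time in seconds.
answer, seconds = model.chat(
    prompt="Descreva a imagem.",
    image="figure.png",  # local path or http(s) URL; chat() requires an image
    tokenizer=tokenizer,
    max_new_tokens=128,
    temperature=0.2,
    top_p=0.9,
)
print(answer)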