RuntimeError: The size of tensor a (32) must match the size of tensor b (1024) at non-singleton dimension 2
#5 · opened by Veinnn
Hi everyone,
I'm trying to fine-tune the AIDC-AI/Ovis2-1B model for X-ray image captioning, using a custom dataset of 448×448, 3-channel RGB X-ray images paired with text descriptions.
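For context, my per-example preprocessing roughly follows the single-image example from the model card. This is a simplified sketch, not my exact collator code; the image path, query text, and `max_partition` value are placeholders:

```python
import torch
from PIL import Image
from transformers import AutoModelForCausalLM

# Simplified sketch adapted from the Ovis2 model card usage;
# my real dataset/collator code does batching and truncation on top of this.
model = AutoModelForCausalLM.from_pretrained(
    "AIDC-AI/Ovis2-1B",
    torch_dtype=torch.bfloat16,
    multimodal_max_length=32768,
    trust_remote_code=True,
).cuda()
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()

image = Image.open("xray_example.png").convert("RGB")  # 448x448 RGB X-ray (placeholder path)
query = "<image>\nDescribe the X-ray image."

# preprocess_inputs resizes/partitions the image and inserts the image tokens into the prompt
prompt, input_ids, pixel_values = model.preprocess_inputs(query, [image], max_partition=9)
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)

input_ids = input_ids.unsqueeze(0).to(model.device)
attention_mask = attention_mask.unsqueeze(0).to(model.device)
# the model's forward expects a list of per-sample pixel_values tensors
pixel_values = [pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)]
```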
I'm getting a RuntimeError during the evaluation step. The full traceback is:
```
RuntimeError Traceback (most recent call last)
<ipython-input-83-f245b31d31e3> in <cell line: 1>()
----> 1 trainer.evaluate()
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
4049
4050 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 4051 output = eval_loop(
4052 eval_dataloader,
4053 description="Evaluation",
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
4243
4244 # Prediction step
-> 4245 losses, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
4246 main_input_name = getattr(self.model, "main_input_name", "input_ids")
4247 inputs_decode = (
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in prediction_step(self, model, inputs, prediction_loss_only, ignore_keys)
4459 if has_labels or loss_without_labels:
4460 with self.compute_loss_context_manager():
-> 4461 loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
4462 loss = loss.mean().detach()
4463
/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py in compute_loss(self, model, inputs, return_outputs, num_items_in_batch)
556 """
557 mode = "eval" if self.control.should_evaluate else "train"
--> 558 (loss, outputs) = super().compute_loss(
559 model, inputs, return_outputs=True, num_items_in_batch=num_items_in_batch
560 )
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs, num_items_in_batch)
3707 loss_kwargs["num_items_in_batch"] = num_items_in_batch
3708 inputs = {**inputs, **loss_kwargs}
-> 3709 outputs = model(**inputs)
3710 # Save past state if it exists
3711 # TODO: this needs to be fixed and made cleaner later.
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
1737
1738 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1748
1749 result = None
/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py in forward(*args, **kwargs)
821
822 def forward(*args, **kwargs):
--> 823 return model_forward(*args, **kwargs)
824
825 # To act like a decorator so that it can be popped when doing `extract_model_from_parallel`
/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py in __call__(self, *args, **kwargs)
809
810 def __call__(self, *args, **kwargs):
--> 811 return convert_to_fp32(self.model_forward(*args, **kwargs))
812
813 def __getstate__(self):
/usr/local/lib/python3.10/dist-packages/torch/amp/autocast_mode.py in decorate_autocast(*args, **kwargs)
42 def decorate_autocast(*args, **kwargs):
43 with autocast_instance:
---> 44 return func(*args, **kwargs)
45
46 decorate_autocast.__script_unsupported = "@autocast() decorator is not supported in script mode" # type: ignore[attr-defined]
[... the three frames above (accelerate/utils/operations.py forward, operations.py __call__, and torch/amp/autocast_mode.py decorate_autocast) repeat several more times and are omitted here ...]
/usr/local/lib/python3.10/dist-packages/peft/peft_model.py in forward(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)
1717 with self._enable_peft_forward_hooks(**kwargs):
1718 kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
-> 1719 return self.base_model(
1720 input_ids=input_ids,
1721 attention_mask=attention_mask,
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
1737
1738 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1748
1749 result = None
/usr/local/lib/python3.10/dist-packages/peft/tuners/tuners_utils.py in forward(self, *args, **kwargs)
195
196 def forward(self, *args: Any, **kwargs: Any):
--> 197 return self.model.forward(*args, **kwargs)
198
199 def _pre_injection_hook(self, model: nn.Module, config: PeftConfig, adapter_name: str) -> None:
~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis2-1B/b5c50bc2836fd46a6cd0feb39269eeb5968fac1d/modeling_ovis.py in forward(self, input_ids, attention_mask, labels, pixel_values, **kwargs)
354 ):
355 # assert self.training, "`forward` can only be used in training. For inference, use `generate`."
--> 356 _, inputs_embeds, labels, attention_mask = self.merge_multimodal(
357 text_input_ids=input_ids,
358 text_attention_masks=attention_mask,
~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis2-1B/b5c50bc2836fd46a6cd0feb39269eeb5968fac1d/modeling_ovis.py in merge_multimodal(self, text_input_ids, text_attention_masks, text_labels, pixel_values, left_padding)
396 num_images = [x.shape[0] if x is not None else 0 for x in pixel_values]
397 if sum(num_images) > 0:
--> 398 visual_tokens = self.visual_tokenizer(torch.cat([x for x in pixel_values if x is not None], dim=0))
399 visual_embeds = torch.split(self.get_vte()(visual_tokens).to(dtype=self.dtype, device=input_device),
400 split_size_or_sections=num_images, dim=0)
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
1737
1738 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1748
1749 result = None
~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis2-1B/b5c50bc2836fd46a6cd0feb39269eeb5968fac1d/modeling_ovis.py in forward(self, pixel_values)
234
235 def forward(self, pixel_values) -> torch.Tensor: # [BatchSize, ImageShape] -> [BatchSize, #Token, VocabSize]
--> 236 features = self.encode(pixel_values)
237 logits = self.head(features)
238 tokens = self.tokenize(logits)
~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis2-1B/b5c50bc2836fd46a6cd0feb39269eeb5968fac1d/modeling_ovis.py in encode(self, pixel_values)
209
210 def encode(self, pixel_values):
--> 211 output = self.backbone(pixel_values, output_hidden_states=True, return_dict=True)
212 features = output.hidden_states[-1]
213 if self.config.drop_cls_token:
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
1737
1738 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1748
1749 result = None
~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis2-1B/b5c50bc2836fd46a6cd0feb39269eeb5968fac1d/modeling_aimv2.py in forward(self, pixel_values, mask, output_hidden_states, return_dict)
182 return_dict = self.config.use_return_dict
183
--> 184 x = self.preprocessor(pixel_values)
185 x, hidden_states = self.trunk(
186 x, mask, output_hidden_states=output_hidden_states
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
1737
1738 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1748
1749 result = None
~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis2-1B/b5c50bc2836fd46a6cd0feb39269eeb5968fac1d/modeling_aimv2.py in forward(self, x)
72
73 def forward(self, x: torch.Tensor) -> torch.Tensor:
---> 74 tokens = self.patchifier(x)
75 _, N, _ = tokens.shape
76 pos_embed = self.pos_embed.to(tokens.device)
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
1737
1738 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1748
1749 result = None
~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis2-1B/b5c50bc2836fd46a6cd0feb39269eeb5968fac1d/modeling_aimv2.py in forward(self, x)
59 def forward(self, x: torch.Tensor) -> torch.Tensor:
60 x = self.proj(x).flatten(2).transpose(1, 2)
---> 61 x = self.norm(x)
62 return x
63
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _wrapped_call_impl(self, *args, **kwargs)
1734 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1735 else:
-> 1736 return self._call_impl(*args, **kwargs)
1737
1738 # torchrec tests the code consistency with the following code
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1745 or _global_backward_pre_hooks or _global_backward_hooks
1746 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1747 return forward_call(*args, **kwargs)
1748
1749 result = None
~/.cache/huggingface/modules/transformers_modules/AIDC-AI/Ovis2-1B/b5c50bc2836fd46a6cd0feb39269eeb5968fac1d/modeling_aimv2.py in forward(self, x)
20 def forward(self, x: torch.Tensor) -> torch.Tensor:
21 output = self._norm(x.float()).type_as(x)
---> 22 return output * self.weight
23
24 def extra_repr(self) -> str:
RuntimeError: The size of tensor a (32) must match the size of tensor b (1024) at non-singleton dimension 2
```
I've already tried the following:

- Verified the visual tokenizer backbone config (`config.visual_tokenizer_config.backbone_config`): `image_size = 448`, `patch_size = 14`, `hidden_size = 1024`, `num_channels = 3`
- Set `tokenizer.model_max_length = model.config.multimodal_max_length = 32768`
- Added truncation to the data collator
- Checked the shape and dtype of `input_ids` and `pixel_values` in the data collator (they seem correct — see the check below)
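In case it helps, this is roughly the check I'm doing; `data_collator` and `train_dataset` are just placeholders for my own objects:

```python
# Quick sanity check on a single collated batch; `data_collator` and
# `train_dataset` stand in for my own dataset/collator objects.
batch = data_collator([train_dataset[0]])

# modeling_ovis.py iterates over pixel_values, so I pass a list with one tensor (or None) per sample
print(type(batch["pixel_values"]))
for pv in batch["pixel_values"]:
    if pv is not None:
        # I expect each entry to be 4-D: [num_partitions, 3, 448, 448]
        print("pixel_values entry:", pv.shape, pv.dtype)

print("input_ids:", batch["input_ids"].shape)
print("attention_mask:", batch["attention_mask"].shape)
print("labels:", batch["labels"].shape)
```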
I'm not sure what else to try. Has anyone else encountered this issue when fine-tuning Ovis2-1B? Any help would be greatly appreciated!