Inference error: replacing the LLM part with a quantized Llama-3.1 70B causes an error (RuntimeError: shape mismatch: value tensor of shape [1024] cannot be broadcast to indexing result of shape [1025])
#75
by CCRss
Approach:
- Load MiniCPM
- Load the 4-bit-quantized Llama 70B as model.llm
- Run inference with it.
Maybe someone knows what is wrong here. Thanks in advance.
Code
# 1. Load MiniCPM
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '6,7'
model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
# 2. Load Llama70B-4bit-quantized as model.llm
from transformers import AutoModelForCausalLM
max_memory = {6: "40GB", 7: "40GB"}
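# NB: with CUDA_VISIBLE_DEVICES='6,7' the two GPUs are re-indexed as 0 and 1 inside
# this process, so a max_memory dict keyed by 6 and 7 may not line up with the
# visible devices; {0: "40GB", 1: "40GB"} would address them directly.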
model.llm = AutoModelForCausalLM.from_pretrained(
    "/raid/vladimir_albrekht/llm_quantization/models/quantized_models/meta_instruct70B_sft_ci4_11700_AWQ-4bit-g128-gemm",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    max_memory=max_memory,
)
model = model.to(device='cuda')
# 3. Inference
image = Image.open('/raid/vladimir_albrekht/vision_lm/MiniCPM-V/images/1250_girl.jpg').convert('RGB')
question = 'Суреттегі қыздың басында қандай бас киім?'  # Kazakh: "What kind of headwear is the girl in the picture wearing?"
msgs = [{'role': 'user', 'content': question}]
res = model.chat(
    image=image,
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    temperature=0.5,
)
print(res)
Error
""" RuntimeError: shape mismatch: value tensor of shape [1024] cannot be broadcast to indexing result of shape [1025] """
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[16], line 5
2 question = 'Суреттегі қыздың басында қандай бас киім?'
3 msgs = [{'role': 'user', 'content': question}]
----> 5 res = model.chat(
6 image=image,
7 msgs=msgs,
8 tokenizer=tokenizer,
9 sampling=True, # if sampling=False, beam_search will be used by default
10 temperature=0.5,
11 # system_prompt='' # pass system_prompt if needed
12 )
13 print(res)
File /raid/vladimir_albrekht/huggingface/modules/transformers_modules/openbmb/MiniCPM-Llama3-V-2_5/320a581d2195ad4a52140bb427a07f7207aeac6e/modeling_minicpmv.py:345, in MiniCPMV.chat(self, image, msgs, tokenizer, processor, vision_hidden_states, max_new_tokens, sampling, max_inp_length, system_prompt, stream, **kwargs)
341 generation_config.update(
342 (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()
343 )
344 with torch.inference_mode():
--> 345 res = self.generate(
346 inputs,
347 tokenizer=tokenizer,
348 max_new_tokens=max_new_tokens,
349 vision_hidden_states=vision_hidden_states,
350 stream=stream,
351 decode_text=True,
352 **generation_config
353 )
355 if stream:
356 def stream_gen():
File /raid/vladimir_albrekht/huggingface/modules/transformers_modules/openbmb/MiniCPM-Llama3-V-2_5/320a581d2195ad4a52140bb427a07f7207aeac6e/modeling_minicpmv.py:264, in MiniCPMV.generate(self, model_inputs, tokenizer, vision_hidden_states, stream, **kwargs)
258 else:
259 model_inputs["vision_hidden_states"] = vision_hidden_states
261 (
262 input_embeds,
263 vision_hidden_states,
--> 264 ) = self.get_vllm_embedding(model_inputs)
266 # output_ids = self._decode(input_embeds, tokenizer, **kwargs)
267 if stream:
File /raid/vladimir_albrekht/huggingface/modules/transformers_modules/openbmb/MiniCPM-Llama3-V-2_5/320a581d2195ad4a52140bb427a07f7207aeac6e/modeling_minicpmv.py:110, in MiniCPMV.get_vllm_embedding(self, data)
107 for i in range(B):
108 patch_attn_mask[i, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True
--> 110 vision_embedding = self.vpm(all_pixel_values.type(dtype), patch_attention_mask=patch_attn_mask).last_hidden_state
111 vision_embedding = self.resampler(vision_embedding, tgt_sizes)
112 else:
113 # get vision_embedding foreach
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/transformers/models/idefics2/modeling_idefics2.py:619, in Idefics2VisionTransformer.forward(self, pixel_values, patch_attention_mask, output_attentions, output_hidden_states, return_dict)
610 patch_attention_mask = torch.ones(
611 (
612 batch_size,
(...)
615 )
616 )
617 patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device)
--> 619 hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
621 patch_attention_mask = patch_attention_mask.view(batch_size, -1)
622 # The call to `_upad_input` in `_flash_attention_forward` is expensive
623 # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
624 # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/transformers/models/idefics2/modeling_idefics2.py:182, in Idefics2VisionEmbeddings.forward(self, pixel_values, patch_attention_mask)
179 bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
181 pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
--> 182 position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
184 position_ids = position_ids.to(self.position_embedding.weight.device)
185 embeddings = embeddings + self.position_embedding(position_ids)
RuntimeError: shape mismatch: value tensor of shape [1024] cannot be broadcast to indexing result of shape [1025]
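In case it helps with diagnosing this: the traceback shows the failure happens inside the vision tower (self.vpm, an Idefics2VisionTransformer) before the swapped-in LLM is even reached. Below is a small sanity-check snippet I can run on the hybrid model (diagnostic only, not part of the script above; vpm, resampler and llm are the MiniCPM-Llama3-V-2_5 submodule names that appear in the traceback, and hidden_size is the standard Llama config attribute).
# Hedged sanity checks on the hybrid model (uses only standard PyTorch /
# transformers attributes and the submodule names from the traceback).
print("llm hidden size:", model.llm.config.hidden_size)  # swapped-in 70B model
print("vpm param dtype/device:",
      next(model.vpm.parameters()).dtype, next(model.vpm.parameters()).device)
print("resampler param dtype/device:",
      next(model.resampler.parameters()).dtype, next(model.resampler.parameters()).device)
# The resampler was built against the original 8B LLM, so its parameter shapes
# show the embedding dimension it expects:
for name, p in model.resampler.named_parameters():
    print("resampler", name, tuple(p.shape))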