Inference error: replacing the LLM part with a quantized Llama-3.1 70B causes an error (RuntimeError: shape mismatch: value tensor of shape [1024] cannot be broadcast to indexing result of shape [1025])
#75
by CCRss
Approach:
- Load MiniCPM
- Load the 4-bit-quantized Llama 70B as model.llm
- Run inference with it.
Maybe someone knows what is wrong here. Thanks in advance.
Code
# 1. Load MiniCPM
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '6,7'
model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
# 2. Load Llama70B-4bit-quantized as model.llm
from transformers import AutoModelForCausalLM
max_memory = {6: "40GB", 7: "40GB"}
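# NB: with CUDA_VISIBLE_DEVICES='6,7' the two GPUs are re-indexed as 0 and 1 inside
# this process, so a max_memory dict keyed by 6 and 7 may not line up with the
# visible devices; {0: "40GB", 1: "40GB"} would address them directly.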
model.llm = AutoModelForCausalLM.from_pretrained(
    "/raid/vladimir_albrekht/llm_quantization/models/quantized_models/meta_instruct70B_sft_ci4_11700_AWQ-4bit-g128-gemm",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    max_memory=max_memory,
)
model = model.to(device='cuda')
# 3. Inference
image = Image.open('/raid/vladimir_albrekht/vision_lm/MiniCPM-V/images/1250_girl.jpg').convert('RGB')
question = 'Суреттегі қыздың басында қандай бас киім?'  # Kazakh: "What kind of headwear is the girl in the picture wearing?"
msgs = [{'role': 'user', 'content': question}]
res = model.chat(
    image=image,
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    temperature=0.5,
)
print(res)
Error
""" RuntimeError: shape mismatch: value tensor of shape [1024] cannot be broadcast to indexing result of shape [1025] """
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[16], line 5
2 question = 'Суреттегі қыздың басында қандай бас киім?'
3 msgs = [{'role': 'user', 'content': question}]
----> 5 res = model.chat(
6 image=image,
7 msgs=msgs,
8 tokenizer=tokenizer,
9 sampling=True, # if sampling=False, beam_search will be used by default
10 temperature=0.5,
11 # system_prompt='' # pass system_prompt if needed
12 )
13 print(res)
File /raid/vladimir_albrekht/huggingface/modules/transformers_modules/openbmb/MiniCPM-Llama3-V-2_5/320a581d2195ad4a52140bb427a07f7207aeac6e/modeling_minicpmv.py:345, in MiniCPMV.chat(self, image, msgs, tokenizer, processor, vision_hidden_states, max_new_tokens, sampling, max_inp_length, system_prompt, stream, **kwargs)
341 generation_config.update(
342 (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()
343 )
344 with torch.inference_mode():
--> 345 res = self.generate(
346 inputs,
347 tokenizer=tokenizer,
348 max_new_tokens=max_new_tokens,
349 vision_hidden_states=vision_hidden_states,
350 stream=stream,
351 decode_text=True,
352 **generation_config
353 )
355 if stream:
356 def stream_gen():
File /raid/vladimir_albrekht/huggingface/modules/transformers_modules/openbmb/MiniCPM-Llama3-V-2_5/320a581d2195ad4a52140bb427a07f7207aeac6e/modeling_minicpmv.py:264, in MiniCPMV.generate(self, model_inputs, tokenizer, vision_hidden_states, stream, **kwargs)
258 else:
259 model_inputs["vision_hidden_states"] = vision_hidden_states
261 (
262 input_embeds,
263 vision_hidden_states,
--> 264 ) = self.get_vllm_embedding(model_inputs)
266 # output_ids = self._decode(input_embeds, tokenizer, **kwargs)
267 if stream:
File /raid/vladimir_albrekht/huggingface/modules/transformers_modules/openbmb/MiniCPM-Llama3-V-2_5/320a581d2195ad4a52140bb427a07f7207aeac6e/modeling_minicpmv.py:110, in MiniCPMV.get_vllm_embedding(self, data)
107 for i in range(B):
108 patch_attn_mask[i, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True
--> 110 vision_embedding = self.vpm(all_pixel_values.type(dtype), patch_attention_mask=patch_attn_mask).last_hidden_state
111 vision_embedding = self.resampler(vision_embedding, tgt_sizes)
112 else:
113 # get vision_embedding foreach
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/transformers/models/idefics2/modeling_idefics2.py:619, in Idefics2VisionTransformer.forward(self, pixel_values, patch_attention_mask, output_attentions, output_hidden_states, return_dict)
610 patch_attention_mask = torch.ones(
611 (
612 batch_size,
(...)
615 )
616 )
617 patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device)
--> 619 hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
621 patch_attention_mask = patch_attention_mask.view(batch_size, -1)
622 # The call to `_upad_input` in `_flash_attention_forward` is expensive
623 # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
624 # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /raid/vladimir_albrekht/anaconda/envs/awq_quant/lib/python3.10/site-packages/transformers/models/idefics2/modeling_idefics2.py:182, in Idefics2VisionEmbeddings.forward(self, pixel_values, patch_attention_mask)
179 bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
181 pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
--> 182 position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
184 position_ids = position_ids.to(self.position_embedding.weight.device)
185 embeddings = embeddings + self.position_embedding(position_ids)
RuntimeError: shape mismatch: value tensor of shape [1024] cannot be broadcast to indexing result of shape [1025]
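In case it helps with diagnosing this: the traceback shows the failure happens inside the vision tower (self.vpm, an Idefics2VisionTransformer) before the swapped-in LLM is even reached. Below is a small sanity-check snippet I can run on the hybrid model (diagnostic only, not part of the script above; vpm, resampler and llm are the MiniCPM-Llama3-V-2_5 submodule names that appear in the traceback, and hidden_size is the standard Llama config attribute).
# Hedged sanity checks on the hybrid model (uses only standard PyTorch /
# transformers attributes and the submodule names from the traceback).
print("llm hidden size:", model.llm.config.hidden_size)  # swapped-in 70B model
print("vpm param dtype/device:",
      next(model.vpm.parameters()).dtype, next(model.vpm.parameters()).device)
print("resampler param dtype/device:",
      next(model.resampler.parameters()).dtype, next(model.resampler.parameters()).device)
# The resampler was built against the original 8B LLM, so its parameter shapes
# show the embedding dimension it expects:
for name, p in model.resampler.named_parameters():
    print("resampler", name, tuple(p.shape))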