Issue with Trainer Training - AttributeError: 'NoneType' object has no attribute 'shape'
Description:
I am working on fine-tuning an image-text model using the Hugging Face AutoModelForImageTextToText and LlavaProcessor. While attempting to train the model using the SFTTrainer, I encountered an error related to a NoneType object during the training loop. The error occurs specifically in the _merge_input_ids_with_image_features method in the modeling_llava.py file.
Note:
I loaded the data (JSON) from my Google Drive.
Error Details:
AttributeError: 'NoneType' object has no attribute 'shape'
Error Occurrence:
The error occurs after calling trainer.train(): during training, the image_features argument passed into the _merge_input_ids_with_image_features function is None, so the AttributeError is raised when the code tries to access its shape.
Code Snippet Leading to the Error:
trainer = SFTTrainer(
model=model,
train_dataset=train_dataset,
peft_config=peft_config,
tokenizer=tokenizer,
args=sft_config,
)
# Train model
trainer.train()
Relevant Model Function:
The error occurs within the following function in modeling_llava.py:
def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels):
num_images, num_image_patches, embed_dim = image_features.shape # Error here
batch_size, sequence_length = input_ids.shape
# Further processing...
Potential Causes:
image_features might not be properly computed or passed to the model (see the sanity-check sketch below).
The image preprocessing function might not return the correct features, or the dataset might not have the expected structure.
The ConstantLengthDataset wrapper used before training (full code below) might yield only token IDs, silently dropping pixel_values; without pixel_values the model never computes image_features.
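To narrow this down, the first thing I plan to try is forwarding one preprocessed example through the model directly, bypassing the trainer. This is an untested sketch; model and train_dataset refer to the objects defined in the full code below:
import torch

# Sanity check (sketch): forward one preprocessed example manually.
# If this succeeds while trainer.train() fails, the trainer-side pipeline
# is likely dropping pixel_values before the forward pass.
sample = train_dataset[0]
batch = {k: v.to(model.device) for k, v in sample.items()}
# The processor was called with return_tensors="pt", so each field may already
# carry a leading batch dimension of 1; add one only where it is missing.
for k in ("input_ids", "attention_mask", "labels"):
    if batch[k].dim() == 1:
        batch[k] = batch[k].unsqueeze(0)
if batch["pixel_values"].dim() == 3:
    batch["pixel_values"] = batch["pixel_values"].unsqueeze(0)
with torch.no_grad():
    out = model(**batch)
print("Manual forward pass OK, loss:", out.loss)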
Request:
Could you help me troubleshoot this issue and suggest how to fix the NoneType error? Specifically:
What might cause the image_features variable to be None?
How can I ensure that image_features is properly populated and passed to the model?
Code:
# Load the base model
model = AutoModelForImageTextToText.from_pretrained(
model_name,
quantization_config=bnb_config,
device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1
# Load the LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
processor = LlavaProcessor.from_pretrained(model_name)
# Prompt template
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": "is there any fracture"},
{"type": "image"},
],
},
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
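For reference, the rendered prompt should contain the image placeholder token, since that is where the image features get spliced in. A quick check (sketch; the exact rendered string depends on the checkpoint's chat template):
print(prompt)
# A LLaVA-style template typically renders something like
# "USER: <image>\nis there any fracture ASSISTANT:" (exact format varies).
# If "<image>" is missing here, the processor has no anchor for the image.
assert "<image>" in prompt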
# Reload the dataset from Google Drive
import json
with open("/content/drive/MyDrive/fineTune model1/sub_datset1.json", "r", encoding="utf-8") as f:
reloaded_dataset1 = json.load(f)
with open("/content/drive/MyDrive/fineTune model1/sub_datset2.json", "r", encoding="utf-8") as f:
reloaded_dataset2 = json.load(f)
with open("/content/drive/MyDrive/fineTune model1/sub_datset3.json", "r", encoding="utf-8") as f:
reloaded_dataset3 = json.load(f)
# Converting to Hugging Face Datasets
from datasets import Dataset
# Convert the reformatted data into a Hugging Face Dataset
hf_dataset1 = Dataset.from_dict({
"image": [item["image"] for item in reloaded_dataset1],
"question": [item["question"] for item in reloaded_dataset1],
"answer": [item["answer"] for item in reloaded_dataset1]
})
# Convert the reformatted data into a Hugging Face Dataset
hf_dataset2 = Dataset.from_dict({
"image": [item["image"] for item in reloaded_dataset2],
"question": [item["question"] for item in reloaded_dataset2],
"answer": [item["answer"] for item in reloaded_dataset2]
})
# Convert the reformatted data into a Hugging Face Dataset
hf_dataset3 = Dataset.from_dict({
"image": [item["image"] for item in reloaded_dataset3],
"question": [item["question"] for item in reloaded_dataset3],
"answer": [item["answer"] for item in reloaded_dataset3]
})
# Merge the datasets
from datasets import concatenate_datasets
merged_dataset1 = concatenate_datasets([hf_dataset1, hf_dataset2, hf_dataset3])
# Print the size of the merged dataset
print(len(merged_dataset1))
# Split the dataset into train, validation, and test sets
# Import the required function from sklearn.model_selection
from sklearn.model_selection import train_test_split
from datasets import DatasetDict
# Convert the Hugging Face Dataset to a pandas DataFrame for splitting
df = merged_dataset1.to_pandas()
# Split into train (80%) and temp (20%)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
# Split temp into validation (10%) and test (10%)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")
# Convert back to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)
# Create a DatasetDict
final_dataset = DatasetDict({
"train": train_dataset,
"validation": val_dataset,
"test": test_dataset
})
# Save the datasets
final_dataset["train"].to_json("/content/drive/MyDrive/fineTune model1/train1.json")
final_dataset["validation"].to_json("/content/drive/MyDrive/fineTune model1/validation1.json")
final_dataset["test"].to_json("/content/drive/MyDrive/fineTune model1/test1.json")
print("Dataset split and saved successfully!")
# Preprocessing function
from PIL import Image
import base64
from io import BytesIO
import torch
# Define the preprocess function
def preprocess_function(samples):
# # Debugging: Print the type and first image entry in the batch
# print(f"Type of samples['image']: {type(samples['image'])}")
# print(f"First image entry (base64): {samples['image'][0]}")
# Initialize an empty list for images
images = []
# Decode and process each image
for img_data in samples["image"]:
if isinstance(img_data, str): # Assuming base64 encoding
try:
# Decode the image from base64 and convert to RGB
img = Image.open(BytesIO(base64.b64decode(img_data))).convert("RGB")
except Exception as e:
print(f"Error loading base64 image: {e}")
img = None
elif isinstance(img_data, Image.Image): # If it's already a PIL Image object
img = img_data.convert("RGB")
else:
print(f"Unsupported image type: {type(img_data)}")
img = None
if img is not None:
images.append(img)
else:
print("Image could not be processed or is None.")
# Now, process the question and images using your processor
inputs = processor(
text=samples["question"],
images=images,
return_tensors="pt",
padding="max_length",
truncation=True,
max_length=512
)
# Ensure the processor tokenizes the answer correctly
labels = processor.tokenizer(
text=samples["answer"],
return_tensors="pt",
padding="max_length",
truncation=True,
max_length=512
)["input_ids"]
# Add labels to the input dictionary
inputs["labels"] = torch.tensor(labels)
# Debugging: Check if pixel_values is present and has the correct shape
print(f"Inputs dictionary: {inputs.keys()}")
if "pixel_values" in inputs:
print(f"Shape of pixel_values: {inputs['pixel_values'].shape}")
else:
print("pixel_values not found in inputs.")
return inputs
# Data preprocessing
train_dataset = final_dataset['train']
test_dataset = final_dataset['test']
eval_dataset = final_dataset["validation"]
print(train_dataset.column_names)
#output: ['image', 'question', 'answer', 'index_level_0']
# Apply the preprocessing function without removing columns
train_dataset = train_dataset.map(preprocess_function, batched=True)  # batched=True: the function iterates over lists of samples
# Now remove the unnecessary columns
train_dataset = train_dataset.remove_columns(["image", "question", "answer", "index_level_0"])
# Set the format to PyTorch tensors
train_dataset.set_format(type="torch")
test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=["image", "question", "answer"])
test_dataset.set_format(type="torch")
eval_dataset = eval_dataset.map(preprocess_function, batched=True, remove_columns=["image", "question", "answer"])
eval_dataset.set_format(type="torch")
print(train_dataset.column_names)
#output: ['input_ids', 'attention_mask', 'pixel_values', 'labels']
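Before wrapping the dataset for the trainer, it seems worth confirming that pixel_values survived the map with a sensible shape (untested sketch):
# Quick inspection (sketch): each example should carry a non-empty
# pixel_values tensor. If it is missing or empty here, the problem is
# upstream in preprocess_function rather than in the trainer.
example = train_dataset[0]
print(type(example["pixel_values"]), example["pixel_values"].shape)
print(example["input_ids"].shape, example["labels"].shape)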
# Prepare for fine-tuning
from trl import SFTConfig, SFTTrainer
from trl.trainer.utils import ConstantLengthDataset
from peft import LoraConfig
# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
major, _ = torch.cuda.get_device_capability()
if major >= 8:
print("=" * 80)
print("Your GPU supports bfloat16: accelerate training with bf16=True")
print("=" * 80)
# Load the LoRA configuration
peft_config = LoraConfig(
lora_alpha=lora_alpha,
lora_dropout=lora_dropout,
r=lora_r,
bias="none",
task_type="CAUSAL_LM",
target_modules=["q_proj", "v_proj","k_proj", "o_proj"]
)
sft_config = SFTConfig(
# SFT-specific settings
max_seq_length=max_seq_length,
dataset_text_field="text",
output_dir=output_dir,
num_train_epochs=num_train_epochs,
per_device_train_batch_size=per_device_train_batch_size,
gradient_accumulation_steps=gradient_accumulation_steps,
optim=optim,
save_steps=save_steps,
logging_steps=logging_steps,
learning_rate=learning_rate,
weight_decay=weight_decay,
fp16=fp16,
bf16=bf16,
max_grad_norm=max_grad_norm,
max_steps=max_steps,
warmup_ratio=warmup_ratio,
group_by_length=False,
lr_scheduler_type=lr_scheduler_type,
report_to="tensorboard",
)
tokenizer.chat_template = "default"
def formatting_func(example):
if isinstance(example["input_ids"], torch.Tensor):
return example["input_ids"].squeeze().tolist()
elif isinstance(example["input_ids"], list): # Check if it's already a list
return example["input_ids"] # Return as is
elif isinstance(example["input_ids"], dict): # Check if it's a dictionary
return example["input_ids"].get("input_ids", []) # Attempt to extract input_ids if it's a dictionary
else:
return [] # Return an empty list in other cases
train_dataset = ConstantLengthDataset(
tokenizer,
train_dataset,
formatting_func=formatting_func,
seq_length=128,
)
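One thing I am unsure about: as far as I can tell, ConstantLengthDataset packs and yields only token buffers (input_ids, with labels copied from them), so pixel_values may never reach the model from here, which would match image_features being None. A quick way to confirm (sketch):
# Inspect what the wrapped dataset actually yields (sketch). If the keys are
# only input_ids/labels, pixel_values is dropped before batching and the
# vision tower never receives the images.
first = next(iter(train_dataset))
print(first.keys())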
trainer = SFTTrainer(
model=model,
train_dataset=train_dataset,
peft_config=peft_config,
tokenizer=tokenizer,
args=sft_config,
)
# Train model
trainer.train()