"""Preprocess a COCO-style dataset with a CLIP processor.

Tokenizes captions and encodes images so the dataset is ready for
CLIP training or evaluation.
"""

from datasets import load_dataset
from transformers import CLIPProcessor

# NOTE(review): 'coco' is not a plain Hub dataset id in current `datasets`
# releases — confirm the correct id/config (e.g. a community COCO mirror)
# against the environment this runs in.
dataset = load_dataset('coco', split='train')
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def preprocess_data(examples):
    """Run the CLIP processor over a batch of examples.

    Args:
        examples: batch dict with a "caption" column (list of str) and an
            "image" column (list of PIL images) — assumed from the column
            names used below; TODO confirm against the actual dataset schema.

    Returns:
        dict of processor outputs (e.g. input_ids, attention_mask,
        pixel_values) as plain Python lists/arrays.
    """
    # No `return_tensors="pt"` here: `Dataset.map` serializes outputs to
    # Arrow, which converts tensors back to lists anyway — requesting torch
    # tensors inside `map` only adds conversion overhead. `padding=True`
    # pads captions to the longest in the batch.
    return processor(
        text=examples["caption"],
        images=examples["image"],
        padding=True,
    )


# batched=True so the processor receives whole batches: this is what makes
# `padding=True` meaningful (per-example calls have nothing to pad against)
# and is substantially faster than one processor call per row.
dataset = dataset.map(preprocess_data, batched=True)