"""Gradio demo that captions an uploaded image with a ViT-GPT2 model."""

from typing import Optional

import gradio as gr
import torch
from PIL import Image
from transformers import AutoTokenizer, VisionEncoderDecoderModel, ViTImageProcessor

model_name = "aryan083/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(model_name)
# ViTImageProcessor is used here in place of the older ViTFeatureExtractor.
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Decoding settings forwarded verbatim to ``model.generate``.
GENERATION_KWARGS = {
    "do_sample": True,
    "max_length": 16,
    "num_beams": 4,
    "temperature": 0.7,
}


def predict_caption(image: Optional[Image.Image]) -> Optional[str]:
    """Generate a caption for a single PIL image.

    Args:
        image: The image to describe, or ``None`` when nothing was uploaded.

    Returns:
        The first decoded caption with surrounding whitespace stripped, or
        ``None`` when ``image`` is ``None``.
    """
    if image is None:
        return None

    # Normalise grayscale / RGBA / palette inputs to three channels; this is
    # a no-op for images that are already RGB.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # The processor takes a batch, so wrap the single image in a list.
    pixel_values = feature_extractor(
        images=[image], return_tensors="pt"
    ).pixel_values.to(device)

    output_ids = model.generate(pixel_values, **GENERATION_KWARGS)
    captions = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return captions[0].strip()


# Create the Gradio interface.
# NOTE: sample inputs can be offered by passing an ``examples`` list of image
# paths to ``gr.Interface`` once example files are available.
iface = gr.Interface(
    fn=predict_caption,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="Image Captioning",
    description="Upload an image and get its description generated using ViT-GPT2",
)

# Launched at module level, as in the original script.
iface.launch()