"""Gradio demo: zero-shot classification of Marvel characters with CLIP."""

import gradio as gr
import torch
from transformers import CLIPModel, CLIPProcessor

# Pretrained CLIP checkpoint and its matching pre-processor, loaded once at startup.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Candidate labels shown in the UI.
classes = [
    "Iron Man",
    "Captain America",
    "Thor",
    "Spider-Man",
    "Black Widow",
    "Black Panther",
    "Hulk",
    "Ant-Man",
    "Peggy Carter",
    "Daredevil",
    "Star-Lord",
    "Wong",
    "Doctor Strange",
    "Nick Fury",
    "Gamora",
    "Jessica Jones",
    "Nebula",
    "Falcon",
    "Winter Soldier",
    "Rocket",
    "Hawkeye",
]

# One text prompt per label; CLIP scores the image against each prompt.
text = [f"a photo of {x}" for x in classes]


def predict(img):
    """Score one image against every label in ``classes``.

    Args:
        img: The image handed over by the Gradio input component.

    Returns:
        A dict mapping each class name to its softmax probability (a Python
        float), in the same order as ``classes``.
    """
    inputs = processor(text=text, images=img, return_tensors="pt", padding=True)

    # Inference only: disabling autograd avoids building a gradient graph
    # (and holding its memory) on every request.
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_image is the image-text similarity score; softmax over the
    # text axis turns it into label probabilities. Row 0 is selected explicitly
    # (a single image is submitted per call) instead of a bare squeeze(),
    # which would also collapse the label axis if only one label existed.
    probs = outputs.logits_per_image.softmax(dim=1)[0]

    return dict(zip(classes, probs.tolist()))


title = "Marvel Heroes Classification"
description = "Using clip for zero-shot classification"
examples = ["black_panter.jpg"]

# NOTE(review): gr.inputs / gr.outputs look like the legacy Gradio component
# namespaces - confirm the pinned Gradio version before upgrading.
gr.Interface(
    fn=predict,
    inputs=gr.inputs.Image(shape=(512, 512)),
    outputs=gr.outputs.Label(),
    examples=examples,
    title=title,
    description=description,
).launch(inline=False)