merve (HF staff) committed
Commit fd70df6 · verified · 1 parent: 725e53c

Update app.py

Files changed (1):
  app.py (+81, -15)
app.py CHANGED
@@ -1,21 +1,87 @@
- from transformers import AutoModel, CLIPImageProcessor, CLIPTokenizer
  import torch
  import spaces


- model_name_or_path = "BAAI/EVA-CLIP-8B"
- image_size = 224

- def load_model():
-     processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
-
-     model = AutoModel.from_pretrained(
-         model_name_or_path,
-         torch_dtype=torch.bfloat16,
-         trust_remote_code=True).to('cuda').eval()
-
-
-     tokenizer = CLIPTokenizer.from_pretrained(model_name_or_path)
-     return model, tokenizer, processor

- load_model()

+ from transformers import AutoModel, AutoTokenizer, CLIPImageProcessor, AutoProcessor, pipeline, CLIPTokenizer
+ import torchvision.transforms as T
+ import torch.nn.functional as F
+ from PIL import Image, ImageFile
+ import requests
  import torch
+ import numpy as np
+ import gradio as gr
  import spaces


+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ model_name_or_path = "BAAI/EVA-CLIP-8B"

+ processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")

+ model = AutoModel.from_pretrained(
+     model_name_or_path,
+     torch_dtype=torch.bfloat16,
+     trust_remote_code=True).to(device).eval()
+
+
+ tokenizer = CLIPTokenizer.from_pretrained(model_name_or_path)
+
+
+ clip_checkpoint = "openai/clip-vit-base-patch16"
+ clip_detector = pipeline(model=clip_checkpoint, task="zero-shot-image-classification", device=device)
+
+
+ def infer_evaclip(image, captions):
+     captions = captions.split(",")
+     input_ids = tokenizer(captions, return_tensors="pt", padding=True).input_ids.to('cuda')
+     input_pixels = processor(images=image, return_tensors="pt", padding=True).pixel_values.to('cuda')
+
+
+     with torch.no_grad(), torch.cuda.amp.autocast():
+         image_features = model.encode_image(input_pixels)
+         text_features = model.encode_text(input_ids)
+         image_features /= image_features.norm(dim=-1, keepdim=True)
+         text_features /= text_features.norm(dim=-1, keepdim=True)
+
+     label_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+     label_probs = label_probs.cpu().numpy().tolist()[0]
+     print(captions)
+     print(label_probs)
+     return {captions[i]: label_probs[i] for i in range(len(captions))}
+
+ def clip_inference(image, labels):
+     candidate_labels = [label.lstrip(" ") for label in labels.split(",")]
+     clip_out = clip_detector(image, candidate_labels=candidate_labels)
+     return {out["label"]: float(out["score"]) for out in clip_out}
+
+ @spaces.GPU
+ def infer(image, labels):
+     clip_out = clip_inference(image, labels)
+     evaclip_out = infer_evaclip(image, labels)
+
+     return clip_out, evaclip_out
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# EVACLIP vs CLIP 💥 ")
+     with gr.Row():
+         with gr.Column():
+             image_input = gr.Image(type="pil")
+             text_input = gr.Textbox(label="Input a list of labels")
+             run_button = gr.Button("Run", visible=True)
+
+         with gr.Column():
+             clip_output = gr.Label(label="CLIP Output", num_top_classes=3)
+             evaclip_output = gr.Label(label="EVA-CLIP Output", num_top_classes=3)
+
+     examples = [["./cat.png", "cat on a table, cat on a tree"]]
+     gr.Examples(
+         examples=examples,
+         inputs=[image_input, text_input],
+         outputs=[clip_output,
+                  evaclip_output],
+         fn=infer,
+         cache_examples=True
+     )
+     run_button.click(fn=infer,
+                      inputs=[image_input, text_input],
+                      outputs=[clip_output,
+                               evaclip_output])
+
+ demo.launch()
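
For reference, a minimal sketch (not part of the commit) of exercising the new infer() function outside the Gradio UI. It assumes the definitions above are already loaded in the same session (for example, run just before demo.launch()) and that the example image ./cat.png referenced in gr.Examples is present:

# Sanity-check sketch: assumes app.py's definitions are in scope and ./cat.png exists.
from PIL import Image

image = Image.open("./cat.png").convert("RGB")
labels = "cat on a table, cat on a tree"

# infer() returns two dicts mapping each label to a probability:
# one from the CLIP zero-shot pipeline, one from EVA-CLIP's scaled
# cosine similarities passed through a softmax.
clip_scores, evaclip_scores = infer(image, labels)
print("CLIP:", clip_scores)
print("EVA-CLIP:", evaclip_scores)

Both paths compare normalized image and text embeddings: the zero-shot pipeline handles preprocessing and scoring internally for CLIP, while infer_evaclip performs tokenization, encoding, normalization, and the softmax over scaled cosine similarities explicitly for EVA-CLIP.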