Martijn van Beers committed
Commit 1fd86da · 1 Parent(s): 0c9f8df
Add model selection
Add an extra input to be able to select which CLIP model to use.
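For orientation before the diff, a minimal sketch of what the change amounts to (a hedged summary, not the full app.py): the CLIP checkpoint is no longer loaded once at import time with a hard-coded "ViT-B/32"; instead a gr.Dropdown is prepended to the inputs and its value is passed into run_demo, which loads the selected model per call.

import clip
import gradio as gr
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

def run_demo(model_name, image, text):
    # The checkpoint is now chosen per request instead of once at import time.
    model, preprocess = clip.load(model_name, device=device, jit=False)
    ...

default_inputs = [
    gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-L/14"),
    gr.components.Image(type='pil', label="Original Image"),
    gr.components.Textbox(label="Image description"),
]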
app.py CHANGED
@@ -47,14 +47,15 @@ colour_map = {
 }
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
 
 # nlp = spacy.load("en_core_web_sm")
 import en_core_web_sm
 nlp = en_core_web_sm.load()
 
 # Gradio Section:
-def run_demo(image, text):
+def run_demo(model_name, image, text):
+
+    model, preprocess = clip.load(model_name, device=device, jit=False)
     orig_image = pad_to_square(image)
     img = preprocess(orig_image).unsqueeze(0).to(device)
     text_input = clip.tokenize([text]).to(device)
@@ -76,6 +77,7 @@ def run_demo(image, text):
 # Default demo:
 
 default_inputs = [
+    gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-L/14"),
     gr.components.Image(type='pil', label="Original Image"),
     gr.components.Textbox(label="Image description"),
 ]
@@ -100,17 +102,17 @@ iface = gr.Interface(fn=run_demo,
                      outputs=default_outputs,
                      title="CLIP Grounding Explainability",
                      description=description,
-                     examples=[["example_images/London.png", "London Eye"],
-                               ["example_images/London.png", "Big Ben"],
-                               ["example_images/harrypotter.png", "Harry"],
-                               ["example_images/harrypotter.png", "Hermione"],
-                               ["example_images/harrypotter.png", "Ron"],
-                               ["example_images/Amsterdam.png", "Amsterdam canal"],
-                               ["example_images/Amsterdam.png", "Old buildings"],
-                               ["example_images/Amsterdam.png", "Pink flowers"],
-                               ["example_images/dogs_on_bed.png", "Two dogs"],
-                               ["example_images/dogs_on_bed.png", "Book"],
-                               ["example_images/dogs_on_bed.png", "Cat"]])
+                     examples=[[None, "example_images/London.png", "London Eye"],
+                               [None, "example_images/London.png", "Big Ben"],
+                               [None, "example_images/harrypotter.png", "Harry"],
+                               [None, "example_images/harrypotter.png", "Hermione"],
+                               [None, "example_images/harrypotter.png", "Ron"],
+                               [None, "example_images/Amsterdam.png", "Amsterdam canal"],
+                               [None, "example_images/Amsterdam.png", "Old buildings"],
+                               [None, "example_images/Amsterdam.png", "Pink flowers"],
+                               [None, "example_images/dogs_on_bed.png", "Two dogs"],
+                               [None, "example_images/dogs_on_bed.png", "Book"],
+                               [None, "example_images/dogs_on_bed.png", "Cat"]])
 
 # NER demo:
 def add_label_to_img(img, label, add_entity_label=True):
@@ -160,6 +162,7 @@ def NER_demo(image, text):
     return labeled_text, gallery_images
 
 inputs_NER = [
+    gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-L/14"),
     gr.Image(type='pil', label="Original Image"),
     gr.components.Textbox(label="Descriptive text"),
 ]
@@ -181,8 +184,8 @@ iface_NER = gr.Interface(fn=NER_demo,
                         title="Named Entity Grounding explainability using CLIP",
                         description=description_NER,
                         examples=[
-                            ["example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."],
-                            ["example_images/harrypotter.png", "Hermione, Harry and Ron in their school uniform"],
+                            [None, "example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."],
+                            [None, "example_images/harrypotter.png", "Hermione, Harry and Ron in their school uniform"],
                         ],
                         cache_examples=False)
 
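One design note on the change, with a sketch under stated assumptions rather than part of the commit: clip.load now runs on every invocation of run_demo, so each request reloads the chosen checkpoint, even when the model has not changed. If that becomes a bottleneck, the load could be memoized, for example with functools.lru_cache; the helper name get_clip below is hypothetical.

import functools

import clip
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

@functools.lru_cache(maxsize=3)
def get_clip(model_name):
    # Returns (model, preprocess); cached per model name so repeated
    # requests do not reload the checkpoint from disk.
    return clip.load(model_name, device=device, jit=False)

def run_demo(model_name, image, text):
    model, preprocess = get_clip(model_name)
    ...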