Martijn van Beers committed • Commit dc15657
1 Parent(s): 8c7ba46

Update to blocks for layer selection
Change from using gradio.Interface to gradio.Blocks so I understand how
to add an event handler to change the range of the layer selection when
the user chooses a different model.
- CLIP_explainability/utils.py +2 -2
- app.py +66 -56
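The core of the change is the pattern sketched below: with gradio.Blocks, a Dropdown's change event can return gr.update(...) to adjust a Slider's range at runtime. This is a minimal, self-contained sketch of that pattern, assuming the gradio 3.x Blocks API; the component labels and demo name are illustrative, while the layer maxima (23 for ViT-L/14, 11 for the ViT-B models) mirror the update_slider handler added in app.py.

import gradio as gr

def set_layer_range(model_name):
    # ViT-L/14 has 24 vision-transformer layers (indices 0-23);
    # the ViT-B models have 12 (indices 0-11).
    if model_name == "ViT-L/14":
        return gr.update(maximum=23, value=23)
    return gr.update(maximum=11, value=11)

with gr.Blocks() as sketch_demo:
    model = gr.Dropdown(choices=["ViT-B/16", "ViT-B/32", "ViT-L/14"],
                        value="ViT-B/32", label="CLIP Model")
    layer = gr.Slider(minimum=0, maximum=11, step=1, value=11,
                      label="Vision start layer")
    # The event handler: whenever the dropdown value changes, the returned
    # gr.update(...) payload is applied to the slider.
    model.change(set_layer_range, inputs=model, outputs=layer)

if __name__ == "__main__":
    sketch_demo.launch()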
CLIP_explainability/utils.py
CHANGED
@@ -13,13 +13,13 @@ _tokenizer = _Tokenizer()
 
 #@title Control context expansion (number of attention layers to consider)
 #@title Number of layers for image Transformer
-start_layer = 11#@param {type:"number"}
+#start_layer = 11#@param {type:"number"}
 
 #@title Number of layers for text Transformer
 start_layer_text = 11#@param {type:"number"}
 
 
-def interpret(image, texts, model, device):
+def interpret(image, texts, model, device, start_layer):
     batch_size = texts.shape[0]
     images = image.repeat(batch_size, 1, 1, 1)
     logits_per_image, logits_per_text = model(images, texts)
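Since interpret() no longer reads a module-level start_layer, every caller now has to pass it explicitly. A hypothetical standalone call, roughly mirroring how app.py invokes it below; the import path CLIP_explainability.utils and the use of a bundled example image are assumptions on my part:

import clip
import torch
from PIL import Image
from CLIP_explainability.utils import interpret  # assumed import path

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

img = preprocess(Image.open("example_images/London.png")).unsqueeze(0).to(device)
text_input = clip.tokenize(["London Eye"]).to(device)

# start_layer is now an explicit argument; 11 is the last attention layer
# of the ViT-B models (12 layers, indexed 0-11).
R_text, R_image = interpret(model=model, image=img, texts=text_input,
                            device=device, start_layer=11)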
app.py
CHANGED
@@ -53,21 +53,29 @@ import en_core_web_sm
 nlp = en_core_web_sm.load()
 
 # Gradio Section:
+def update_slider(model):
+    if model == "ViT-L/14":
+        return gr.update(maximum=23, value=23)
+    else:
+        return gr.update(maximum=11, value=11)
+
 def run_demo(*args):
-    if len(args) == 3:
-        image, text, model_name = args
+    if len(args) == 4:
+        image, text, model_name, vision_layer = args
     elif len(args) == 2:
         image, text = args
         model_name = "ViT-B/32"
+        vision_layer = 11
     else:
         raise ValueError("Unexpected number of parameters")
 
+    vision_layer = int(vision_layer)
     model, preprocess = clip.load(model_name, device=device, jit=False)
     orig_image = pad_to_square(image)
     img = preprocess(orig_image).unsqueeze(0).to(device)
     text_input = clip.tokenize([text]).to(device)
 
-    R_text, R_image = interpret(model=model, image=img, texts=text_input, device=device)
+    R_text, R_image = interpret(model=model, image=img, texts=text_input, device=device, start_layer=vision_layer)
 
     image_relevance = show_img_heatmap(R_image[0], img, orig_image=orig_image, device=device)
     overlapped = overlay_relevance_map_on_image(image, image_relevance)
@@ -83,18 +91,6 @@ def run_demo(*args):
 
 # Default demo:
 
-default_inputs = [
-    gr.components.Image(type='pil', label="Original Image"),
-    gr.components.Textbox(label="Image description"),
-    gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-B/32"),
-]
-
-default_outputs = [
-    gr.components.Image(type='pil', label="Output Image"),
-    gr.components.HighlightedText(label="Text importance"),
-]
-
-
 description = """This demo is a copy of the demo CLIPGroundingExlainability built by Paul Hilders, Danilo de Goede and Piyush Bagad, as part of the course Interpretability and Explainability in AI (MSc AI, UvA, June 2022).
 <br> <br>
 This demo shows attributions scores on both the image and the text input when presenting CLIP with a
@@ -104,23 +100,37 @@ description = """This demo is a copy of the demo CLIPGroundingExlainability buil
 methods such as the one from this demo can only give an estimate of the real underlying behavior
 of the model."""
 
-  [17 removed lines not captured in this diff view]
+with gr.Blocks(title="CLIP Grounding Explainability") as iface_default:
+    gr.Markdown(description)
+    with gr.Row():
+        with gr.Column() as inputs:
+            orig = gr.components.Image(type='pil', label="Original Image")
+            description = gr.components.Textbox(label="Image description")
+            default_model = gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-B/32")
+            default_layer = gr.Slider(label="Vision start layer", minimum=0, maximum=11, step=1, value=11)
+            submit = gr.Button("Submit")
+        with gr.Column() as outputs:
+            image = gr.components.Image(type='pil', label="Output Image")
+            text = gr.components.HighlightedText(label="Text importance")
+    gr.Examples(
+        examples=[
+            ["example_images/London.png", "London Eye"],
+            ["example_images/London.png", "Big Ben"],
+            ["example_images/harrypotter.png", "Harry"],
+            ["example_images/harrypotter.png", "Hermione"],
+            ["example_images/harrypotter.png", "Ron"],
+            ["example_images/Amsterdam.png", "Amsterdam canal"],
+            ["example_images/Amsterdam.png", "Old buildings"],
+            ["example_images/Amsterdam.png", "Pink flowers"],
+            ["example_images/dogs_on_bed.png", "Two dogs"],
+            ["example_images/dogs_on_bed.png", "Book"],
+            ["example_images/dogs_on_bed.png", "Cat"]
+        ],
+        inputs=[orig, description]
+    )
+    default_model.change(update_slider, inputs=default_model, outputs=default_layer)
+    submit.click(run_demo, inputs=[orig, description, default_model, default_layer], outputs=[image, text])
+
 
 # NER demo:
 def add_label_to_img(img, label, add_entity_label=True):
@@ -170,36 +180,36 @@ def NER_demo(image, text, model_name):
 
     return labeled_text, gallery_images
 
-inputs_NER = [
-    gr.Image(type='pil', label="Original Image"),
-    gr.components.Textbox(label="Descriptive text"),
-    gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-L/14"),
-]
-
-#colours = highlighter._style["color_map"]
-outputs_NER = [
-    gr.components.HighlightedText(show_legend=True, color_map=colour_map, label="Noun chunks"),
-    gr.components.Gallery(type='pil', label="NER Entity explanations")
-]
 
 description_NER = """Automatically generated CLIP grounding explanations for
-  [1 removed line not captured in this diff view]
+noun chunks, retrieved with the spaCy model. <span style="color:red">Warning:</span> Note
 that attribution methods such as the one from this demo can only give an estimate of the real
 underlying behavior of the model."""
 
-  [12 removed lines not captured in this diff view]
+with gr.Blocks(title="Entity Grounding explainability using CLIP") as iface_NER:
+    gr.Markdown(description_NER)
+    with gr.Row():
+        with gr.Column() as inputs:
+            img = gr.Image(type='pil', label="Original Image")
+            text = gr.components.Textbox(label="Descriptive text")
+            ner_model = gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-B/32")
+            ner_layer = gr.Slider(label="Vision start layer", minimum=0, maximum=11, step=1, value=11)
+            submit = gr.Button("Submit")
+        with gr.Column() as outputs:
+            text = gr.components.HighlightedText(show_legend=True, color_map=colour_map, label="Noun chunks")
+            gallery = gr.components.Gallery(type='pil', label="NER Entity explanations")
+
+    gr.Examples(
+        examples=[
+            ["example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."],
+            ["example_images/harrypotter.png", "Hermione, Harry and Ron in their school uniform"],
+        ],
+        inputs=[img, text],
+    )
+    ner_model.change(update_slider, inputs=ner_model, outputs=ner_layer)
+    submit.click(run_demo, inputs=[img, text, ner_model, ner_layer], outputs=[text, gallery])
 
+demo_tabs = gr.TabbedInterface([iface_default, iface_NER], ["Default", "Entities"])
 with demo_tabs:
     gr.Markdown("""
     ### Acknowledgements