Spaces: Runtime error
Commit adf3a47
Martijn van Beers committed
Parent(s): 6395dfb

Move text and examples into separate files

Files changed:
- app.py: +15 / -52
- description.md: +9 / -0
- entity_description.md: +5 / -0
- entity_examples.csv: +3 / -0
- examples.csv: +12 / -0
- footer.md: +13 / -0
app.py CHANGED

@@ -1,8 +1,9 @@
 import re
 import sys
+import pathlib
+import csv
 import gradio as gr
 
-# sys.path.append("../")
 sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")
 
 import torch

@@ -24,6 +25,12 @@ clip.clip._MODELS = {
     "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
 }
 
+def iter_file(filename):
+    with pathlib.Path(filename).open("r") as fh:
+        header = next(fh)
+        for line in fh:
+            yield line
+
 colour_map = {
     "N": "#f77189",
     "CARDINAL": "#f7764a",

@@ -91,17 +98,9 @@ def run_demo(*args):
 
 # Default demo:
 
-
-<br> <br>
-This demo shows attributions scores on both the image and the text input when presenting CLIP with a
-<text,image> pair. Attributions are computed as Gradient-weighted Attention Rollout (Chefer et al.,
-2021), and can be thought of as an estimate of the effective attention CLIP pays to its input when
-computing a multimodal representation. <span style="color:red">Warning:</span> Note that attribution
-methods such as the one from this demo can only give an estimate of the real underlying behavior
-of the model."""
-
+examples = list(csv.reader(iter_file("examples.csv")))
 with gr.Blocks(title="CLIP Grounding Explainability") as iface_default:
-    gr.Markdown(description)
+    gr.Markdown(pathlib.Path("description.md").read_text)
     with gr.Row():
         with gr.Column() as inputs:
             orig = gr.components.Image(type='pil', label="Original Image")

@@ -112,22 +111,7 @@ with gr.Blocks(title="CLIP Grounding Explainability") as iface_default:
         with gr.Column() as outputs:
             image = gr.components.Image(type='pil', label="Output Image")
             text = gr.components.HighlightedText(label="Text importance")
-    gr.Examples(
-        examples=[
-            ["example_images/London.png", "London Eye"],
-            ["example_images/London.png", "Big Ben"],
-            ["example_images/harrypotter.png", "Harry"],
-            ["example_images/harrypotter.png", "Hermione"],
-            ["example_images/harrypotter.png", "Ron"],
-            ["example_images/Amsterdam.png", "Amsterdam canal"],
-            ["example_images/Amsterdam.png", "Old buildings"],
-            ["example_images/Amsterdam.png", "Pink flowers"],
-            ["example_images/dogs_on_bed.png", "Two dogs"],
-            ["example_images/dogs_on_bed.png", "Book"],
-            ["example_images/dogs_on_bed.png", "Cat"]
-        ],
-        inputs=[orig, description]
-    )
+    gr.Examples(examples=examples, inputs=[orig, description])
     default_model.change(update_slider, inputs=default_model, outputs=default_layer)
     submit.click(run_demo, inputs=[orig, description, default_model, default_layer], outputs=[image, text])
 

@@ -181,13 +165,9 @@ def NER_demo(image, text, model_name):
     return labeled_text, gallery_images
 
 
-
-noun chunks, retrieved with the spaCy model. <span style="color:red">Warning:</span> Note
-that attribution methods such as the one from this demo can only give an estimate of the real
-underlying behavior of the model."""
-
+entity_examples = list(csv.reader(iter_file("entity_examples.csv")))
 with gr.Blocks(title="Entity Grounding explainability using CLIP") as iface_NER:
-    gr.Markdown(
+    gr.Markdown(pathlib.Path("entity_description.md").read_text)
     with gr.Row():
         with gr.Column() as inputs:
             img = gr.Image(type='pil', label="Original Image")

@@ -199,28 +179,11 @@ with gr.Blocks(title="Entity Grounding explainability using CLIP") as iface_NER:
             text = gr.components.HighlightedText(show_legend=True, color_map=colour_map, label="Noun chunks")
             gallery = gr.components.Gallery(type='pil', label="NER Entity explanations")
 
-    gr.Examples(
-        examples=[
-            ["example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."],
-            ["example_images/harrypotter.png", "Hermione, Harry and Ron in their school uniform"],
-        ],
-        inputs=[img, text],
-    )
+    gr.Examples(examples=entity_examples, inputs=[img, text])
     ner_model.change(update_slider, inputs=ner_model, outputs=ner_layer)
     submit.click(run_demo, inputs=[img, intext, ner_model, ner_layer], outputs=[text, gallery])
 
 demo_tabs = gr.TabbedInterface([iface_default, iface_NER], ["Default", "Entities"])
 with demo_tabs:
-    gr.Markdown(""
-### Acknowledgements
-This demo was developed for the Interpretability & Explainability in AI course at the University of
-Amsterdam. We would like to express our thanks to Jelle Zuidema, Jaap Jumelet, Tom Kersten, Christos
-Athanasiadis, Peter Heemskerk, Zhi Zhang, and all the other TAs who helped us during this course.
-
----
-### References
-\[1\]: Chefer, H., Gur, S., & Wolf, L. (2021). Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformers. <br>
-\[2\]: Abnar, S., & Zuidema, W. (2020). Quantifying attention flow in transformers. arXiv preprint arXiv:2005.00928. <br>
-\[3\]: [https://samiraabnar.github.io/articles/2020-04/attention_flow](https://samiraabnar.github.io/articles/2020-04/attention_flow) <br>
-""")
+    gr.Markdown(pathlib.Path("footer.md").read_text)
 demo_tabs.launch(show_error=True)
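One detail in the new code worth flagging: `pathlib.Path("description.md").read_text` (and the matching calls for entity_description.md and footer.md) is passed to `gr.Markdown` without being called, so the component receives the bound method rather than the file text. Depending on the Gradio version, a callable value may be invoked at load time or passed through as-is; calling `read_text()` explicitly avoids the ambiguity. A minimal sketch of the explicit form, assuming the markdown files from this commit sit next to app.py:

    import pathlib
    import gradio as gr

    with gr.Blocks(title="CLIP Grounding Explainability") as iface_default:
        # read_text() is called here, so gr.Markdown receives the file contents as a string
        gr.Markdown(pathlib.Path("description.md").read_text())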
description.md ADDED

@@ -0,0 +1,9 @@
+This demo is a copy of the demo CLIPGroundingExlainability built by Paul Hilders, Danilo de Goede and Piyush Bagad, as part of the course Interpretability and Explainability in AI (MSc AI, UvA, June 2022).
+
+
+This demo shows attributions scores on both the image and the text input when presenting CLIP with a
+<text,image> pair. Attributions are computed as Gradient-weighted Attention Rollout (Chefer et al.,
+2021), and can be thought of as an estimate of the effective attention CLIP pays to its input when
+computing a multimodal representation. <span style="color:red">Warning:</span> Note that attribution
+methods such as the one from this demo can only give an estimate of the real underlying behavior
+of the model.
entity_description.md ADDED

@@ -0,0 +1,5 @@
+Automatically generated CLIP grounding explanations for noun chunks,
+retrieved with the spaCy model.
+<span style="color:red">Warning:</span> Note that attribution methods
+such as the one from this demo can only give an estimate of the real
+underlying behavior of the model.
entity_examples.csv ADDED

@@ -0,0 +1,3 @@
+image,text
+"example_images/London.png","In this image we see Big Ben and the London Eye, on both sides of the river Thames."
+"example_images/harrypotter.png","Hermione, Harry and Ron in their school uniform"
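The sentences in the `text` column contain commas, which is why the fields are quoted; the `csv.reader` call in app.py keeps each quoted sentence in a single field, whereas a naive `line.split(",")` would cut it apart. A quick check (hypothetical snippet, not part of the commit):

    import csv

    line = '"example_images/harrypotter.png","Hermione, Harry and Ron in their school uniform"'
    row = next(csv.reader([line]))
    # row == ['example_images/harrypotter.png', 'Hermione, Harry and Ron in their school uniform']
    assert len(row) == 2 and "," in row[1]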
examples.csv ADDED

@@ -0,0 +1,12 @@
+image,text
+"example_images/London.png","London Eye"
+"example_images/London.png","Big Ben"
+"example_images/harrypotter.png","Harry"
+"example_images/harrypotter.png","Hermione"
+"example_images/harrypotter.png","Ron"
+"example_images/Amsterdam.png","Amsterdam canal"
+"example_images/Amsterdam.png","Old buildings"
+"example_images/Amsterdam.png","Pink flowers"
+"example_images/dogs_on_bed.png","Two dogs"
+"example_images/dogs_on_bed.png","Book"
+"example_images/dogs_on_bed.png","Cat"
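For reference, a standalone sketch of how app.py now consumes this file: the helper skips the `image,text` header, `csv.reader` turns each remaining line into an `[image, text]` pair, and `gr.Examples` takes that list directly. Component names below are simplified stand-ins for those in app.py, and the sketch assumes this CSV and the example images are present:

    import csv
    import pathlib
    import gradio as gr

    def iter_file(filename):
        # yield the data lines of a CSV file, skipping the header row
        with pathlib.Path(filename).open("r") as fh:
            next(fh)
            yield from fh

    examples = list(csv.reader(iter_file("examples.csv")))
    # examples[0] == ['example_images/London.png', 'London Eye']

    with gr.Blocks() as demo:
        orig = gr.Image(type="pil", label="Original Image")
        caption = gr.Textbox(label="Text")
        # each CSV row becomes one clickable example filling the two inputs
        gr.Examples(examples=examples, inputs=[orig, caption])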
footer.md ADDED

@@ -0,0 +1,13 @@
+### Acknowledgements
+
+This demo was developed for the Interpretability & Explainability in AI course at the University of
+Amsterdam. We would like to express our thanks to Jelle Zuidema, Jaap Jumelet, Tom Kersten, Christos
+Athanasiadis, Peter Heemskerk, Zhi Zhang, and all the other TAs who helped us during this course.
+
+---
+
+### References
+
+\[1\]: Chefer, H., Gur, S., & Wolf, L. (2021). Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformers. <br>
+\[2\]: Abnar, S., & Zuidema, W. (2020). Quantifying attention flow in transformers. arXiv preprint arXiv:2005.00928. <br>
+\[3\]: [https://samiraabnar.github.io/articles/2020-04/attention_flow](https://samiraabnar.github.io/articles/2020-04/attention_flow) <br>