import re
import sys

import gradio as gr

# sys.path.append("../")
sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")

import torch
import CLIP.clip as clip
import spacy
from PIL import Image, ImageFont, ImageDraw, ImageOps

from clip_grounding.utils.image import pad_to_square
from clip_grounding.datasets.png import (
    overlay_relevance_map_on_image,
)
from CLIP_explainability.utils import interpret, show_img_heatmap, show_heatmap_on_text
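
# Keep only the ViT checkpoints used by this demo in the CLIP model registry,
# with their download URLs pinned (the ResNet variants are dropped).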
clip.clip._MODELS = {
    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
    "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
    "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
}
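
# Colour per spaCy entity label, plus "N" for noun chunks without a named-entity type.
# Used for the HighlightedText legend in the NER tab and for the gallery image captions.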
colour_map = {
    "N": "#f77189",
    "CARDINAL": "#f7764a",
    "DATE": "#d98a32",
    "EVENT": "#bf9632",
    "FAC": "#a99e31",
    "GPE": "#90a531",
    "LANGUAGE": "#68ad31",
    "LAW": "#32b25e",
    "LOC": "#34af86",
    "MONEY": "#35ae9c",
    "NORP": "#36acac",
    "ORDINAL": "#37aabd",
    "ORG": "#39a7d4",
    "PERCENT": "#539ff4",
    "PERSON": "#9890f4",
    "PRODUCT": "#c47ef4",
    "QUANTITY": "#ef5ff4",
    "TIME": "#f565d0",
    "WORK_OF_ART": "#f66baf",
}
device = "cuda" if torch.cuda.is_available() else "cpu" | |
# nlp = spacy.load("en_core_web_sm") | |
import en_core_web_sm | |
nlp = en_core_web_sm.load() | |

# Gradio Section:
def update_slider(model):
    # ViT-L/14 has 24 transformer blocks (layers 0-23); the ViT-B models have 12 (layers 0-11).
    if model == "ViT-L/14":
        return gr.update(maximum=23, value=23)
    else:
        return gr.update(maximum=11, value=11)

def run_demo(*args):
    if len(args) == 4:
        image, text, model_name, vision_layer = args
    elif len(args) == 2:
        image, text = args
        model_name = "ViT-B/32"
        vision_layer = 11
    else:
        raise ValueError("Unexpected number of parameters")

    vision_layer = int(vision_layer)
    model, preprocess = clip.load(model_name, device=device, jit=False)

    orig_image = pad_to_square(image)
    img = preprocess(orig_image).unsqueeze(0).to(device)
    text_input = clip.tokenize([text]).to(device)

    R_text, R_image = interpret(model=model, image=img, texts=text_input, device=device, start_layer=vision_layer)

    image_relevance = show_img_heatmap(R_image[0], img, orig_image=orig_image, device=device)
    overlapped = overlay_relevance_map_on_image(image, image_relevance)

    text_scores, text_tokens_decoded = show_heatmap_on_text(text, text_input, R_text[0])

    highlighted_text = []
    for i, token in enumerate(text_tokens_decoded):
        highlighted_text.append((str(token), float(text_scores[i])))

    return overlapped, highlighted_text
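
# Standalone usage sketch (outside Gradio), assuming one of the bundled example images;
# the click handlers below call run_demo with exactly these four arguments:
#
#   heatmap, token_scores = run_demo(
#       Image.open("example_images/London.png").convert("RGB"),
#       "London Eye",
#       "ViT-B/32",
#       11,
#   )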

# Default demo:
description = """This demo is a copy of the CLIPGroundingExplainability demo built by Paul Hilders, Danilo de Goede and Piyush Bagad, as part of the course Interpretability and Explainability in AI (MSc AI, UvA, June 2022).
<br> <br>
This demo shows attribution scores on both the image and the text input when presenting CLIP with a
(text, image) pair. Attributions are computed as Gradient-weighted Attention Rollout (Chefer et al.,
2021) and can be thought of as an estimate of the effective attention CLIP pays to its input when
computing a multimodal representation. <span style="color:red">Warning:</span> Attribution
methods such as the one in this demo can only give an estimate of the model's real underlying
behavior."""
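
# A minimal, self-contained sketch of the gradient-weighted attention rollout that `interpret`
# performs (Chefer et al., 2021) -- illustration only; the actual implementation lives in the
# Transformer-MM-Explainability submodule. Here `attentions` and `gradients` are assumed to be
# per-layer self-attention maps and their gradients w.r.t. the image-text similarity, each of
# shape (heads, tokens, tokens). The relevance row of the CLS token gives the heatmap.
def _attention_rollout_sketch(attentions, gradients, start_layer=0):
    num_tokens = attentions[0].shape[-1]
    relevance = torch.eye(num_tokens, device=attentions[0].device)  # every token starts self-relevant
    for attn, grad in list(zip(attentions, gradients))[start_layer:]:
        attn_bar = (grad * attn).clamp(min=0).mean(dim=0)  # gradient-weighted, averaged over heads
        relevance = relevance + attn_bar @ relevance       # rollout accumulation step
    return relevance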

with gr.Blocks(title="CLIP Grounding Explainability") as iface_default:
    gr.Markdown(description)
    with gr.Row():
        with gr.Column() as inputs:
            orig = gr.components.Image(type='pil', label="Original Image")
            description = gr.components.Textbox(label="Image description")
            default_model = gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-B/32")
            default_layer = gr.Slider(label="Vision start layer", minimum=0, maximum=11, step=1, value=11)
            submit = gr.Button("Submit")
        with gr.Column() as outputs:
            image = gr.components.Image(type='pil', label="Output Image")
            text = gr.components.HighlightedText(label="Text importance")

    gr.Examples(
        examples=[
            ["example_images/London.png", "London Eye"],
            ["example_images/London.png", "Big Ben"],
            ["example_images/harrypotter.png", "Harry"],
            ["example_images/harrypotter.png", "Hermione"],
            ["example_images/harrypotter.png", "Ron"],
            ["example_images/Amsterdam.png", "Amsterdam canal"],
            ["example_images/Amsterdam.png", "Old buildings"],
            ["example_images/Amsterdam.png", "Pink flowers"],
            ["example_images/dogs_on_bed.png", "Two dogs"],
            ["example_images/dogs_on_bed.png", "Book"],
            ["example_images/dogs_on_bed.png", "Cat"],
        ],
        inputs=[orig, description],
    )

    default_model.change(update_slider, inputs=default_model, outputs=default_layer)
    submit.click(run_demo, inputs=[orig, description, default_model, default_layer], outputs=[image, text])

# NER demo:
def add_label_to_img(img, label, add_entity_label=True):
    img = ImageOps.expand(img, border=45, fill=(255, 255, 255))
    draw = ImageDraw.Draw(img)
    try:
        font = ImageFont.truetype("arial.ttf", 24)
    except OSError:
        # arial.ttf is not available on most Linux hosts; fall back to PIL's built-in font.
        font = ImageFont.load_default()

    m = re.match(r".*\((\w+)\)", label)
    if add_entity_label and m is not None:
        cat = m.group(1)
        # Convert the "#rrggbb" hex colour of this entity type into an (r, g, b) tuple.
        colours = tuple(map(lambda l: int(''.join(l), 16), zip(*[iter(colour_map[cat][1:])] * 2)))
        draw.text((5, 5), label, align="center", fill=colours, font=font)
    else:
        draw.text((5, 5), label, align="center", fill=(0, 0, 0), font=font)

    return img
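
# Worked example of the hex-to-RGB conversion above: colour_map["PERSON"] == "#9890f4",
# which unpacks to the fill tuple (152, 144, 244).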

def NER_demo(image, text, model_name, vision_layer):
    # First, run the default demo on the full input text:
    overlapped, highlighted_text = run_demo(image, text, model_name, vision_layer)
    gallery_images = [add_label_to_img(overlapped, "Complete sentence", add_entity_label=False)]

    labeled_text = dict(
        text=text,
        entities=[],
    )

    # Then, run the demo for each of the noun chunks in the text:
    for chunk in nlp(text).noun_chunks:
        # Skip chunks that consist of a single pronoun (e.g. "we", "it").
        if len(chunk) == 1 and chunk[0].pos_ == "PRON":
            continue

        chunk_text = chunk.text

        # Label the chunk with the entity type of its first entity token,
        # or "N" if no token in the chunk carries a named-entity type.
        chunk_label = None
        for t in chunk:
            if t.ent_type_ != '':
                chunk_label = t.ent_type_
                break
        if chunk_label is None:
            chunk_label = "N"

        labeled_text['entities'].append({'entity': chunk_label, 'start': chunk.start_char, 'end': chunk.end_char})

        overlapped, highlighted_text = run_demo(image, chunk_text, model_name, vision_layer)
        overlapped_labelled = add_label_to_img(overlapped, f"{chunk_text} ({chunk_label})")
        gallery_images.append(overlapped_labelled)

    return labeled_text, gallery_images
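
# Sketch of the two return values for a hypothetical caption "Big Ben at night"
# (entity labels and character offsets depend on the spaCy parse, so these are illustrative only):
#
#   labeled_text   == {"text": "Big Ben at night",
#                      "entities": [{"entity": "FAC", "start": 0, "end": 7},
#                                   {"entity": "TIME", "start": 11, "end": 16}]}
#   gallery_images == [full-sentence heatmap, "Big Ben (FAC)" heatmap, "night (TIME)" heatmap]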

description_NER = """Automatically generated CLIP grounding explanations for
noun chunks, extracted with the spaCy model. <span style="color:red">Warning:</span> Attribution
methods such as the one in this demo can only give an estimate of the model's real underlying
behavior."""
with gr.Blocks(title="Entity Grounding explainability using CLIP") as iface_NER: | |
gr.Markdown(description_NER) | |
with gr.Row(): | |
with gr.Column() as inputs: | |
img = gr.Image(type='pil', label="Original Image") | |
text = gr.components.Textbox(label="Descriptive text") | |
ner_model = gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-B/32") | |
ner_layer = gr.Slider(label="Vision start layer", minimum=0, maximum=11, step=1, value=11) | |
submit = gr.Button("Submit") | |
with gr.Column() as outputs: | |
text = gr.components.HighlightedText(show_legend=True, color_map=colour_map, label="Noun chunks") | |
gallery = gr.components.Gallery(type='pil', label="NER Entity explanations") | |
gr.Examples( | |
examples=[ | |
["example_images/London.png", "In this image we see Big Ben and the London Eye, on both sides of the river Thames."], | |
["example_images/harrypotter.png", "Hermione, Harry and Ron in their school uniform"], | |
], | |
inputs=[img, text], | |
) | |
ner_model.change(update_slider, inputs=ner_model, outputs=ner_layer) | |
submit.click(run_demo, inputs=[img, text, ner_model, ner_layer], outputs=[text, gallery]) | |

demo_tabs = gr.TabbedInterface([iface_default, iface_NER], ["Default", "Entities"])

with demo_tabs:
    gr.Markdown("""
### Acknowledgements
This demo was developed for the Interpretability & Explainability in AI course at the University of
Amsterdam. We would like to express our thanks to Jelle Zuidema, Jaap Jumelet, Tom Kersten, Christos
Athanasiadis, Peter Heemskerk, Zhi Zhang, and all the other TAs who helped us during this course.

---

### References
\[1\]: Chefer, H., Gur, S., & Wolf, L. (2021). Generic attention-model explainability for interpreting bi-modal and encoder-decoder transformers. In Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV). <br>
\[2\]: Abnar, S., & Zuidema, W. (2020). Quantifying attention flow in transformers. arXiv preprint arXiv:2005.00928. <br>
\[3\]: [https://samiraabnar.github.io/articles/2020-04/attention_flow](https://samiraabnar.github.io/articles/2020-04/attention_flow) <br>
""")

demo_tabs.launch(show_error=True)