File size: 7,608 Bytes
7751ada
0241217
adf3a47
 
0241217
 
 
 
 
 
 
d80767e
c3ca2bd
d80767e
0241217
 
 
 
 
 
 
 
 
0c9f8df
 
0241217
 
adf3a47
 
 
 
 
 
7751ada
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0241217
 
ae6e057
 
 
d80767e
0241217
dc15657
 
 
 
 
 
f992b4c
dc15657
 
f992b4c
 
1b49495
dc15657
f992b4c
 
1fd86da
dc15657
1fd86da
0241217
 
 
 
dc15657
0241217
8f3d1af
0241217
 
8f3d1af
0241217
 
 
 
 
929c841
 
 
 
 
adf3a47
dc15657
adf3a47
dc15657
 
 
 
 
 
 
 
 
 
adf3a47
dc15657
 
 
929c841
 
39c7251
201e3f5
c3ca2bd
39c7251
7751ada
 
 
 
 
 
39c7251
7751ada
c3ca2bd
 
 
8c7ba46
 
6d91375
8c7ba46
cca85c2
7751ada
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c7ba46
7751ada
 
 
 
 
0241217
adf3a47
dc15657
adf3a47
dc15657
 
 
c081dbe
dc15657
 
 
 
 
 
 
adf3a47
dc15657
c081dbe
b43e284
dc15657
b43e284
adf3a47
330a2ff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import re
import sys
import pathlib
import csv
import gradio as gr

sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")

import torch
import CLIP.clip as clip

import spacy
from PIL import Image, ImageFont, ImageDraw, ImageOps

from clip_grounding.utils.image import pad_to_square
from clip_grounding.datasets.png import (
    overlay_relevance_map_on_image,
)
from CLIP_explainability.utils import interpret, show_img_heatmap, show_heatmap_on_text

# Overwrite the `_MODELS` mapping of the imported CLIP module with these four
# model-name -> checkpoint-URL entries.
# NOTE(review): `_MODELS` is a private attribute; presumably clip.load() looks
# model names up in this table to find the checkpoint to download — confirm
# against the vendored CLIP package before changing it.
clip.clip._MODELS = {
    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
    "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
    "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
}

def iter_file(filename):
    """Yield the lines of a text file, skipping its first (header) line.

    The remaining lines are yielded as read, line endings included, so the
    generator can be fed straight into ``csv.reader``.

    Args:
        filename: Path of the file to read (``str`` or path-like).

    Yields:
        Every line after the first one. Nothing is yielded for an empty file
        or a file that only contains the header line.
    """
    # newline="" leaves line endings untouched, which is what the csv module
    # asks for when it is handed a file's lines.
    with pathlib.Path(filename).open("r", newline="") as fh:
        # Skip the header. The None default matters: a bare next(fh) on an
        # empty file raises StopIteration, which inside a generator is turned
        # into a RuntimeError instead of simply ending the iteration.
        next(fh, None)
        yield from fh

# Hex colour for every label shown in the entity demo: the spaCy entity types
# plus "N", the label NER_demo gives to noun chunks without an entity type.
# Used both for the HighlightedText legend and for the gallery image captions.
colour_map = {
    "N": "#f77189",
    "CARDINAL": "#f7764a",
    "DATE": "#d98a32",
    "EVENT": "#bf9632",
    "FAC": "#a99e31",
    "GPE": "#90a531",
    "LANGUAGE": "#68ad31",
    "LAW": "#32b25e",
    "LOC": "#34af86",
    "MONEY": "#35ae9c",
    "NORP": "#36acac",
    "ORDINAL": "#37aabd",
    "ORG": "#39a7d4",
    "PERCENT": "#539ff4",
    "PERSON": "#9890f4",
    "PRODUCT": "#c47ef4",
    "QUANTITY": "#ef5ff4",
    "TIME": "#f565d0",
    "WORK_OF_ART": "#f66baf",
}

# Torch device string used for the CLIP model and its inputs: the GPU when
# torch reports CUDA as available, the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

# spaCy English pipeline; NER_demo uses it for noun chunks and entity types.
# It is loaded through the `en_core_web_sm` package itself instead of by name:
# nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
nlp = en_core_web_sm.load()

# Gradio Section:
def update_slider(model):
    """Return a Gradio update that re-ranges the start-layer slider.

    The slider's maximum and current value are both set to 23 when ``model``
    is "ViT-L/14" and to 11 for any other model name.
    """
    top_layer = 23 if model == "ViT-L/14" else 11
    return gr.update(maximum=top_layer, value=top_layer)

def run_demo(*args):
    """Compute CLIP relevance explanations for one image/text pair.

    Accepted positional call forms::

        run_demo(image, text)
        run_demo(image, text, model_name)
        run_demo(image, text, model_name, vision_layer)

    Args:
        image: Input image (the Gradio image components in this file deliver
            PIL images).
        text: Text to ground in the image.
        model_name: CLIP model name passed to ``clip.load``. Defaults to
            "ViT-B/32" in the two-argument form.
        vision_layer: Value passed to ``interpret`` as ``start_layer``;
            converted with ``int()``. When omitted it defaults to 23 for
            "ViT-L/14" and to 11 otherwise, the same values ``update_slider``
            selects.

    Returns:
        A tuple ``(overlapped, highlighted_text)``: the input image with the
        relevance map overlaid, and a list of ``(token, score)`` pairs for the
        decoded text tokens.

    Raises:
        ValueError: If called with anything other than 2, 3 or 4 arguments.
    """
    if len(args) == 4:
        image, text, model_name, vision_layer = args
    elif len(args) == 3:
        # Form used by NER_demo: the caller picks the model but not the layer.
        image, text, model_name = args
        vision_layer = 23 if model_name == "ViT-L/14" else 11
    elif len(args) == 2:
        image, text = args
        model_name = "ViT-B/32"
        vision_layer = 11
    else:
        raise ValueError("Unexpected number of parameters")

    # Gradio sliders may deliver the layer as a float.
    vision_layer = int(vision_layer)
    model, preprocess = clip.load(model_name, device=device, jit=False)
    orig_image = pad_to_square(image)
    img = preprocess(orig_image).unsqueeze(0).to(device)
    text_input = clip.tokenize([text]).to(device)

    R_text, R_image = interpret(model=model, image=img, texts=text_input, device=device, start_layer=vision_layer)

    image_relevance = show_img_heatmap(R_image[0], img, orig_image=orig_image, device=device)
    overlapped = overlay_relevance_map_on_image(image, image_relevance)

    text_scores, text_tokens_decoded = show_heatmap_on_text(text, text_input, R_text[0])

    # Plain (str, float) pairs are what gr.HighlightedText expects.
    highlighted_text = [
        (str(token), float(text_scores[i]))
        for i, token in enumerate(text_tokens_decoded)
    ]

    return overlapped, highlighted_text


# Default demo:

# Example rows for the default tab, read from examples.csv (header line skipped
# by iter_file); each row supplies values for [orig, description].
examples = list(csv.reader(iter_file("examples.csv")))
with gr.Blocks(title="CLIP Grounding Explainability") as iface_default:
    # NOTE(review): `read_text` is passed uncalled (as a bound method).
    # Presumably this relies on Gradio accepting a callable as a component
    # value and invoking it itself — confirm this is intended.
    gr.Markdown(pathlib.Path("description.md").read_text)
    with gr.Row():
        # Left column: the inputs handed to run_demo.
        with gr.Column() as inputs:
            orig = gr.components.Image(type='pil', label="Original Image")
            description = gr.components.Textbox(label="Image description")
            default_model = gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-B/32")
            # Initial range matches the default model; update_slider re-ranges
            # it whenever the model dropdown changes.
            default_layer = gr.Slider(label="Vision start layer", minimum=0, maximum=11, step=1, value=11)
            submit = gr.Button("Submit")
        # Right column: the two values returned by run_demo.
        with gr.Column() as outputs:
            image = gr.components.Image(type='pil', label="Output Image")
            text = gr.components.HighlightedText(label="Text importance")
    gr.Examples(examples=examples, inputs=[orig, description])
    default_model.change(update_slider, inputs=default_model, outputs=default_layer)
    # Four inputs -> the four-argument form of run_demo.
    submit.click(run_demo, inputs=[orig, description, default_model, default_layer], outputs=[image, text])


# NER demo:
def add_label_to_img(img, label, add_entity_label=True):
    """Return a copy of ``img`` with a white border and ``label`` drawn on it.

    The image is expanded by a 45-pixel white border and the label is written
    at position (5, 5) in 24-point "arial.ttf". When ``add_entity_label`` is
    true and the label contains a parenthesised word such as "(ORG)", the text
    is drawn in that category's ``colour_map`` colour; otherwise it is black.
    """
    framed = ImageOps.expand(img, border=45, fill=(255,255,255))
    canvas = ImageDraw.Draw(framed)
    label_font = ImageFont.truetype("arial.ttf", 24)

    category = re.match(r".*\((\w+)\)", label)
    if not add_entity_label or category is None:
        canvas.text((5,5), label, align="center", fill=(0, 0, 0), font=label_font)
        return framed

    # Turn "#rrggbb" into an (r, g, b) tuple, two hex digits per channel.
    hex_colour = colour_map[category.group(1)]
    rgb = tuple(
        int(hex_colour[pos:pos + 2], 16)
        for pos in range(1, len(hex_colour) - 1, 2)
    )
    canvas.text((5,5), label, align="center", fill=rgb, font=label_font)
    return framed

def NER_demo(image, text, model_name, vision_layer=None):
    """Explain the whole sentence and each of its noun chunks with CLIP.

    ``run_demo`` is executed once for the complete text and once for every
    noun chunk spaCy finds in it (chunks consisting of a single pronoun are
    skipped). Each resulting overlay is captioned and collected in a gallery.

    Args:
        image: Input image (a PIL image from the Gradio component).
        text: Descriptive text to analyse.
        model_name: CLIP model name forwarded to ``run_demo``.
        vision_layer: Start layer forwarded to ``run_demo``. When ``None`` it
            defaults to 23 for "ViT-L/14" and 11 otherwise, the same values
            ``update_slider`` selects.

    Returns:
        A tuple ``(labeled_text, gallery_images)``: a dict with the keys
        ``text`` and ``entities`` (one entry per noun chunk with its label and
        character span), and the list of captioned overlay images, the first
        one being the overlay for the complete sentence.
    """
    if vision_layer is None:
        vision_layer = 23 if model_name == "ViT-L/14" else 11

    # As the default image, we run the default demo on the input image and text.
    # run_demo is always called in its four-argument form.
    overlapped, _ = run_demo(image, text, model_name, vision_layer)

    gallery_images = [add_label_to_img(overlapped, "Complete sentence", add_entity_label=False)]

    labeled_text = dict(
            text=text,
            entities=[],
        )

    # Then, we run the demo for each of the noun chunks in the text:
    for chunk in nlp(text).noun_chunks:
        if len(chunk) == 1 and chunk[0].pos_ == "PRON":
            continue
        chunk_text = chunk.text
        # Label the chunk with the entity type of its first token that has
        # one; plain noun chunks get the generic "N" label.
        chunk_label = next((t.ent_type_ for t in chunk if t.ent_type_ != ''), "N")

        labeled_text['entities'].append({'entity': chunk_label, 'start': chunk.start_char, 'end': chunk.end_char})
        overlapped, _ = run_demo(image, chunk_text, model_name, vision_layer)
        overlapped_labelled = add_label_to_img(overlapped, f"{chunk_text} ({chunk_label})")
        gallery_images.append(overlapped_labelled)

    return labeled_text, gallery_images


# Example rows for the entity tab, read from entity_examples.csv (header line
# skipped by iter_file); each row supplies values for [img, intext].
entity_examples = list(csv.reader(iter_file("entity_examples.csv")))
with gr.Blocks(title="Entity Grounding explainability using CLIP") as iface_NER:
    gr.Markdown(pathlib.Path("entity_description.md").read_text)
    with gr.Row():
        with gr.Column() as inputs:
            img = gr.Image(type='pil', label="Original Image")
            intext = gr.components.Textbox(label="Descriptive text")
            ner_model = gr.Dropdown(label="CLIP Model", choices=['ViT-B/16', 'ViT-B/32', 'ViT-L/14'], value="ViT-B/32")
            ner_layer = gr.Slider(label="Vision start layer", minimum=0, maximum=11, step=1, value=11)
            submit = gr.Button("Submit")
        with gr.Column() as outputs:
            text = gr.components.HighlightedText(show_legend=True, color_map=colour_map, label="Noun chunks")
            gallery = gr.components.Gallery(type='pil', label="NER Entity explanations")

    # Examples fill the input textbox, not the HighlightedText output.
    gr.Examples(examples=entity_examples, inputs=[img, intext])
    ner_model.change(update_slider, inputs=ner_model, outputs=ner_layer)
    # NER_demo returns (labeled_text, gallery_images), matching [text, gallery].
    submit.click(NER_demo, inputs=[img, intext, ner_model, ner_layer], outputs=[text, gallery])

demo_tabs = gr.TabbedInterface([iface_default, iface_NER], ["Default", "Entities"])
with demo_tabs:
    gr.Markdown(pathlib.Path("footer.md").read_text)
demo_tabs.launch(show_error=True)