Spaces: Runtime error
paul hilders committed
Commit: 0241217 • 1 Parent(s): 8d581a7
Add new version of demo for IEAI course
Browse files
- .gitmodules +3 -0
- CLIP_explainability/Transformer-MM-Explainability +1 -0
- CLIP_explainability/utils.py +152 -0
- app.py +67 -0
- clip_grounding/datasets/png.py +231 -0
- clip_grounding/datasets/png_utils.py +135 -0
- clip_grounding/evaluation/clip_on_png.py +362 -0
- clip_grounding/evaluation/qualitative_results.py +93 -0
- clip_grounding/utils/image.py +46 -0
- clip_grounding/utils/io.py +116 -0
- clip_grounding/utils/log.py +57 -0
- clip_grounding/utils/paths.py +10 -0
- clip_grounding/utils/visualize.py +183 -0
- example_images/Amsterdam.png +0 -0
- example_images/London.png +0 -0
- example_images/dogs_on_bed.png +0 -0
- example_images/harrypotter.png +0 -0
- requirements.txt +121 -0
.gitmodules ADDED
@@ -0,0 +1,3 @@
[submodule "CLIP_explainability/Transformer-MM-Explainability"]
    path = CLIP_explainability/Transformer-MM-Explainability
    url = https://github.com/hila-chefer/Transformer-MM-Explainability.git
CLIP_explainability/Transformer-MM-Explainability ADDED
@@ -0,0 +1 @@
Subproject commit 6a2c3c9da3fc186878e0c2bcf238c3a4c76d8af8
CLIP_explainability/utils.py ADDED
@@ -0,0 +1,152 @@
import torch
import CLIP.clip as clip
from PIL import Image
import numpy as np
import cv2
import matplotlib.pyplot as plt
from captum.attr import visualization
import os


from CLIP.clip.simple_tokenizer import SimpleTokenizer as _Tokenizer
_tokenizer = _Tokenizer()

#@title Control context expansion (number of attention layers to consider)
#@title Number of layers for image Transformer
start_layer = 11#@param {type:"number"}

#@title Number of layers for text Transformer
start_layer_text = 11#@param {type:"number"}


def interpret(image, texts, model, device):
    batch_size = texts.shape[0]
    images = image.repeat(batch_size, 1, 1, 1)
    logits_per_image, logits_per_text = model(images, texts)
    probs = logits_per_image.softmax(dim=-1).detach().cpu().numpy()
    index = [i for i in range(batch_size)]
    one_hot = np.zeros((logits_per_image.shape[0], logits_per_image.shape[1]), dtype=np.float32)
    one_hot[torch.arange(logits_per_image.shape[0]), index] = 1
    one_hot = torch.from_numpy(one_hot).requires_grad_(True)
    one_hot = torch.sum(one_hot.to(device) * logits_per_image)
    model.zero_grad()

    image_attn_blocks = list(dict(model.visual.transformer.resblocks.named_children()).values())
    num_tokens = image_attn_blocks[0].attn_probs.shape[-1]
    R = torch.eye(num_tokens, num_tokens, dtype=image_attn_blocks[0].attn_probs.dtype).to(device)
    R = R.unsqueeze(0).expand(batch_size, num_tokens, num_tokens)
    for i, blk in enumerate(image_attn_blocks):
        if i < start_layer:
            continue
        grad = torch.autograd.grad(one_hot, [blk.attn_probs], retain_graph=True)[0].detach()
        cam = blk.attn_probs.detach()
        cam = cam.reshape(-1, cam.shape[-1], cam.shape[-1])
        grad = grad.reshape(-1, grad.shape[-1], grad.shape[-1])
        cam = grad * cam
        cam = cam.reshape(batch_size, -1, cam.shape[-1], cam.shape[-1])
        cam = cam.clamp(min=0).mean(dim=1)
        R = R + torch.bmm(cam, R)
    image_relevance = R[:, 0, 1:]


    text_attn_blocks = list(dict(model.transformer.resblocks.named_children()).values())
    num_tokens = text_attn_blocks[0].attn_probs.shape[-1]
    R_text = torch.eye(num_tokens, num_tokens, dtype=text_attn_blocks[0].attn_probs.dtype).to(device)
    R_text = R_text.unsqueeze(0).expand(batch_size, num_tokens, num_tokens)
    for i, blk in enumerate(text_attn_blocks):
        if i < start_layer_text:
            continue
        grad = torch.autograd.grad(one_hot, [blk.attn_probs], retain_graph=True)[0].detach()
        cam = blk.attn_probs.detach()
        cam = cam.reshape(-1, cam.shape[-1], cam.shape[-1])
        grad = grad.reshape(-1, grad.shape[-1], grad.shape[-1])
        cam = grad * cam
        cam = cam.reshape(batch_size, -1, cam.shape[-1], cam.shape[-1])
        cam = cam.clamp(min=0).mean(dim=1)
        R_text = R_text + torch.bmm(cam, R_text)
    text_relevance = R_text

    return text_relevance, image_relevance


def show_image_relevance(image_relevance, image, orig_image, device, show=True):
    # create heatmap from mask on image
    def show_cam_on_image(img, mask):
        heatmap = cv2.applyColorMap(np.uint8(255 * mask), cv2.COLORMAP_JET)
        heatmap = np.float32(heatmap) / 255
        cam = heatmap + np.float32(img)
        cam = cam / np.max(cam)
        return cam

    # plt.axis('off')
    # f, axarr = plt.subplots(1,2)
    # axarr[0].imshow(orig_image)

    if show:
        fig, axs = plt.subplots(1, 2)
        axs[0].imshow(orig_image);
        axs[0].axis('off');

    image_relevance = image_relevance.reshape(1, 1, 7, 7)
    image_relevance = torch.nn.functional.interpolate(image_relevance, size=224, mode='bilinear')
    image_relevance = image_relevance.reshape(224, 224).to(device).data.cpu().numpy()
    image_relevance = (image_relevance - image_relevance.min()) / (image_relevance.max() - image_relevance.min())
    image = image[0].permute(1, 2, 0).data.cpu().numpy()
    image = (image - image.min()) / (image.max() - image.min())
    vis = show_cam_on_image(image, image_relevance)
    vis = np.uint8(255 * vis)
    vis = cv2.cvtColor(np.array(vis), cv2.COLOR_RGB2BGR)

    if show:
        # axar[1].imshow(vis)
        axs[1].imshow(vis);
        axs[1].axis('off');
        # plt.imshow(vis)

    return image_relevance


def show_heatmap_on_text(text, text_encoding, R_text, show=True):
    CLS_idx = text_encoding.argmax(dim=-1)
    R_text = R_text[CLS_idx, 1:CLS_idx]
    text_scores = R_text / R_text.sum()
    text_scores = text_scores.flatten()
    # print(text_scores)
    text_tokens=_tokenizer.encode(text)
    text_tokens_decoded=[_tokenizer.decode([a]) for a in text_tokens]
    vis_data_records = [visualization.VisualizationDataRecord(text_scores,0,0,0,0,0,text_tokens_decoded,1)]

    if show:
        visualization.visualize_text(vis_data_records)

    return text_scores, text_tokens_decoded


def show_img_heatmap(image_relevance, image, orig_image, device, show=True):
    return show_image_relevance(image_relevance, image, orig_image, device, show=show)


def show_txt_heatmap(text, text_encoding, R_text, show=True):
    return show_heatmap_on_text(text, text_encoding, R_text, show=show)


def load_dataset():
    dataset_path = os.path.join('..', '..', 'dummy-data', '71226_segments' + '.pt')
    device = "cuda" if torch.cuda.is_available() else "cpu"

    data = torch.load(dataset_path, map_location=device)

    return data


class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
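For orientation, the sketch below shows how `interpret` is typically called; it is not part of the commit. It assumes the code is run from the repository root so that the Transformer-MM-Explainability submodule, whose patched CLIP records `attn_probs` on every attention block, is the `CLIP` package that gets imported, and it reuses one of the example images shipped with the Space.

# Minimal usage sketch for interpret() (illustrative, not part of the commit).
import sys
sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")  # patched CLIP with attn_probs

import torch
import CLIP.clip as clip
from PIL import Image
from CLIP_explainability.utils import interpret

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

image = preprocess(Image.open("example_images/London.png")).unsqueeze(0).to(device)
texts = clip.tokenize(["London Eye"]).to(device)

# R_text: per-token relevance; R_image: per-patch relevance (49 patches for ViT-B/32 at 224x224)
R_text, R_image = interpret(image=image, texts=texts, model=model, device=device)
print(R_text.shape, R_image.shape)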
app.py ADDED
@@ -0,0 +1,67 @@
import sys
import gradio as gr

# sys.path.append("../")
sys.path.append("CLIP_explainability/Transformer-MM-Explainability/")

import torch
import CLIP.clip as clip


from clip_grounding.utils.image import pad_to_square
from clip_grounding.datasets.png import (
    overlay_relevance_map_on_image,
)
from CLIP_explainability.utils import interpret, show_img_heatmap, show_heatmap_on_text

clip.clip._MODELS = {
    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
}

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

# Gradio Section:
def run_demo(image, text):
    orig_image = pad_to_square(image)
    img = preprocess(orig_image).unsqueeze(0).to(device)
    text_input = clip.tokenize([text]).to(device)

    R_text, R_image = interpret(model=model, image=img, texts=text_input, device=device)

    image_relevance = show_img_heatmap(R_image[0], img, orig_image=orig_image, device=device, show=False)
    overlapped = overlay_relevance_map_on_image(image, image_relevance)

    text_scores, text_tokens_decoded = show_heatmap_on_text(text, text_input, R_text[0], show=False)

    highlighted_text = []
    for i, token in enumerate(text_tokens_decoded):
        highlighted_text.append((str(token), float(text_scores[i])))

    return overlapped, highlighted_text

input_img = gr.inputs.Image(type='pil', label="Original Image")
input_txt = "text"
inputs = [input_img, input_txt]

outputs = [gr.inputs.Image(type='pil', label="Output Image"), "highlight"]


iface = gr.Interface(fn=run_demo,
                     inputs=inputs,
                     outputs=outputs,
                     title="CLIP Grounding Explainability",
                     description="A demonstration based on the Generic Attention-model Explainability method for Interpreting Bi-Modal Transformers by Chefer et al. (2021): https://github.com/hila-chefer/Transformer-MM-Explainability.",
                     examples=[["example_images/London.png", "London Eye"],
                               ["example_images/London.png", "Big Ben"],
                               ["example_images/harrypotter.png", "Harry"],
                               ["example_images/harrypotter.png", "Hermione"],
                               ["example_images/harrypotter.png", "Ron"],
                               ["example_images/Amsterdam.png", "Amsterdam canal"],
                               ["example_images/Amsterdam.png", "Old buildings"],
                               ["example_images/Amsterdam.png", "Pink flowers"],
                               ["example_images/dogs_on_bed.png", "Two dogs"],
                               ["example_images/dogs_on_bed.png", "Book"],
                               ["example_images/dogs_on_bed.png", "Cat"]])
iface.launch(debug=True)
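app.py builds the interface with the legacy `gr.inputs.*` and string-shortcut component API. On newer Gradio releases the same construction would look roughly like the sketch below; the component names are an assumption about Gradio 3.x+ and this block is not part of the commit.

# Rough Gradio 3.x+ equivalent of the interface construction above (illustrative only).
import gradio as gr

iface = gr.Interface(
    fn=run_demo,  # the same callback defined in app.py
    inputs=[gr.Image(type="pil", label="Original Image"), gr.Textbox(label="Text")],
    outputs=[gr.Image(type="pil", label="Output Image"), gr.HighlightedText(label="Text relevance")],
    title="CLIP Grounding Explainability",
)
iface.launch()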
clip_grounding/datasets/png.py ADDED
@@ -0,0 +1,231 @@
"""
Dataset object for Panoptic Narrative Grounding.

Paper: https://openaccess.thecvf.com/content/ICCV2021/papers/Gonzalez_Panoptic_Narrative_Grounding_ICCV_2021_paper.pdf
"""

import os
from os.path import join, isdir, exists

import torch
from torch.utils.data import Dataset
import cv2
from PIL import Image
from skimage import io
import numpy as np
import textwrap
import matplotlib.pyplot as plt
from matplotlib import transforms
from imgaug.augmentables.segmaps import SegmentationMapsOnImage
import matplotlib.colors as mc

from clip_grounding.utils.io import load_json
from clip_grounding.datasets.png_utils import show_image_and_caption


class PNG(Dataset):
    """Panoptic Narrative Grounding."""

    def __init__(self, dataset_root, split) -> None:
        """
        Initializer.

        Args:
            dataset_root (str): path to the folder containing PNG dataset
            split (str): MS-COCO split such as train2017/val2017
        """
        super().__init__()

        assert isdir(dataset_root)
        self.dataset_root = dataset_root

        assert split in ["val2017"], f"Split {split} not supported. "\
            "Currently, only supports split `val2017`."
        self.split = split

        self.ann_dir = join(self.dataset_root, "annotations")
        # feat_dir = join(self.dataset_root, "features")

        panoptic = load_json(join(self.ann_dir, "panoptic_{:s}.json".format(split)))
        images = panoptic["images"]
        self.images_info = {i["id"]: i for i in images}
        panoptic_anns = panoptic["annotations"]
        self.panoptic_anns = {int(a["image_id"]): a for a in panoptic_anns}

        # self.panoptic_pred_path = join(
        #     feat_dir, split, "panoptic_seg_predictions"
        # )
        # assert isdir(self.panoptic_pred_path)

        panoptic_narratives_path = join(self.dataset_root, "annotations", f"png_coco_{split}.json")
        self.panoptic_narratives = load_json(panoptic_narratives_path)

    def __len__(self):
        return len(self.panoptic_narratives)

    def get_image_path(self, image_id: str):
        image_path = join(self.dataset_root, "images", self.split, f"{image_id.zfill(12)}.jpg")
        return image_path

    def __getitem__(self, idx: int):
        narr = self.panoptic_narratives[idx]

        image_id = narr["image_id"]
        image_path = self.get_image_path(image_id)
        assert exists(image_path)

        image = Image.open(image_path)
        caption = narr["caption"]

        # show_single_image(image, title=caption, titlesize=12)

        segments = narr["segments"]

        image_id = int(narr["image_id"])
        panoptic_ann = self.panoptic_anns[image_id]
        panoptic_ann = self.panoptic_anns[image_id]
        segment_infos = {}
        for s in panoptic_ann["segments_info"]:
            idi = s["id"]
            segment_infos[idi] = s

        image_info = self.images_info[image_id]
        panoptic_segm = io.imread(
            join(
                self.ann_dir,
                "panoptic_segmentation",
                self.split,
                "{:012d}.png".format(image_id),
            )
        )
        panoptic_segm = (
            panoptic_segm[:, :, 0]
            + panoptic_segm[:, :, 1] * 256
            + panoptic_segm[:, :, 2] * 256 ** 2
        )

        panoptic_ann = self.panoptic_anns[image_id]
        # panoptic_pred = io.imread(
        #     join(self.panoptic_pred_path, "{:012d}.png".format(image_id))
        # )[:, :, 0]


        # # select a single utterance to visualize
        # segment = segments[7]
        # segment_ids = segment["segment_ids"]
        # segment_mask = np.zeros((image_info["height"], image_info["width"]))
        # for segment_id in segment_ids:
        #     segment_id = int(segment_id)
        #     segment_mask[panoptic_segm == segment_id] = 1.

        utterances = [s["utterance"] for s in segments]
        outputs = []
        for i, segment in enumerate(segments):

            # create segmentation mask on image
            segment_ids = segment["segment_ids"]

            # if no annotation for this word, skip
            if not len(segment_ids):
                continue

            segment_mask = np.zeros((image_info["height"], image_info["width"]))
            for segment_id in segment_ids:
                segment_id = int(segment_id)
                segment_mask[panoptic_segm == segment_id] = 1.

            # store the outputs
            text_mask = np.zeros(len(utterances))
            text_mask[i] = 1.
            segment_data = dict(
                image=image,
                text=utterances,
                image_mask=segment_mask,
                text_mask=text_mask,
                full_caption=caption,
            )
            outputs.append(segment_data)

            # # visualize segmentation mask with associated text
            # segment_color = "red"
            # segmap = SegmentationMapsOnImage(
            #     segment_mask.astype(np.uint8), shape=segment_mask.shape,
            # )
            # image_with_segmap = segmap.draw_on_image(np.asarray(image), colors=[0, COLORS[segment_color]])[0]
            # image_with_segmap = Image.fromarray(image_with_segmap)

            # colors = ["black" for _ in range(len(utterances))]
            # colors[i] = segment_color
            # show_image_and_caption(image_with_segmap, utterances, colors)

        return outputs


def overlay_segmask_on_image(image, image_mask, segment_color="red"):
    segmap = SegmentationMapsOnImage(
        image_mask.astype(np.uint8), shape=image_mask.shape,
    )
    rgb_color = mc.to_rgb(segment_color)
    rgb_color = 255 * np.array(rgb_color)
    image_with_segmap = segmap.draw_on_image(np.asarray(image), colors=[0, rgb_color])[0]
    image_with_segmap = Image.fromarray(image_with_segmap)
    return image_with_segmap


def get_text_colors(text, text_mask, segment_color="red"):
    colors = ["black" for _ in range(len(text))]
    colors[text_mask.nonzero()[0][0]] = segment_color
    return colors


def overlay_relevance_map_on_image(image, heatmap):
    width, height = image.size

    # resize the heatmap to image size
    heatmap = cv2.resize(heatmap, (width, height))
    heatmap = np.uint8(255 * heatmap)
    heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)

    # create overlapped super image
    img = np.asarray(image)
    super_img = heatmap * 0.4 + img * 0.6
    super_img = np.uint8(super_img)
    super_img = Image.fromarray(super_img)

    return super_img


def visualize_item(image, text, image_mask, text_mask, segment_color="red"):

    segmap = SegmentationMapsOnImage(
        image_mask.astype(np.uint8), shape=image_mask.shape,
    )
    rgb_color = mc.to_rgb(segment_color)
    rgb_color = 255 * np.array(rgb_color)
    image_with_segmap = segmap.draw_on_image(np.asarray(image), colors=[0, rgb_color])[0]
    image_with_segmap = Image.fromarray(image_with_segmap)

    colors = ["black" for _ in range(len(text))]

    text_idx = text_mask.argmax()
    colors[text_idx] = segment_color
    show_image_and_caption(image_with_segmap, text, colors)


if __name__ == "__main__":
    from clip_grounding.utils.paths import REPO_PATH, DATASET_ROOTS

    PNG_ROOT = DATASET_ROOTS["PNG"]
    dataset = PNG(dataset_root=PNG_ROOT, split="val2017")

    item = dataset[0]
    sub_item = item[1]
    visualize_item(
        image=sub_item["image"],
        text=sub_item["text"],
        image_mask=sub_item["image_mask"],
        text_mask=sub_item["text_mask"],
        segment_color="red",
    )
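Each `PNG` item is a list of per-phrase entries rather than a single sample. A quick inspection sketch is shown below; it is not part of the commit and assumes the PNG annotations and COCO val2017 images have been downloaded under `data/panoptic_narrative_grounding`.

# Inspecting one PNG item (illustrative sketch, not part of the commit).
from clip_grounding.datasets.png import PNG
from clip_grounding.utils.paths import DATASET_ROOTS

dataset = PNG(dataset_root=DATASET_ROOTS["PNG"], split="val2017")
item = dataset[0]                 # list of entries, one per grounded noun phrase
entry = item[0]
print(len(item), "grounded phrases in this narrative")
print(entry["full_caption"])      # the full narrative caption
print(entry["text"])              # list of utterances; entry["text_mask"] marks the grounded one
print(entry["image_mask"].shape)  # binary segmentation mask (H, W) for that phrase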
clip_grounding/datasets/png_utils.py ADDED
@@ -0,0 +1,135 @@
"""Helper functions for Panoptic Narrative Grounding."""

import os
from os.path import join, isdir, exists
from typing import List

import torch
from PIL import Image
from skimage import io
import numpy as np
import textwrap
import matplotlib.pyplot as plt
from matplotlib import transforms
from imgaug.augmentables.segmaps import SegmentationMapsOnImage


def rainbow_text(x,y,ls,lc,fig, ax,**kw):
    """
    Take a list of strings ``ls`` and colors ``lc`` and place them next to each
    other, with text ls[i] being shown in color lc[i].

    Ref: https://stackoverflow.com/questions/9169052/partial-coloring-of-text-in-matplotlib
    """
    t = ax.transAxes

    for s,c in zip(ls,lc):

        text = ax.text(x,y,s+" ",color=c, transform=t, **kw)
        text.draw(fig.canvas.get_renderer())
        ex = text.get_window_extent()
        t = transforms.offset_copy(text._transform, x=ex.width, units='dots')


def find_first_index_greater_than(elements, key):
    return next(x[0] for x in enumerate(elements) if x[1] > key)


def split_caption_phrases(caption_phrases, colors, max_char_in_a_line=50):
    char_lengths = np.cumsum([len(x) for x in caption_phrases])
    thresholds = [max_char_in_a_line * i for i in range(1, 1 + char_lengths[-1] // max_char_in_a_line)]

    utt_per_line = []
    col_per_line = []
    start_index = 0
    for t in thresholds:
        index = find_first_index_greater_than(char_lengths, t)
        utt_per_line.append(caption_phrases[start_index:index])
        col_per_line.append(colors[start_index:index])
        start_index = index

    return utt_per_line, col_per_line


def show_image_and_caption(image: Image, caption_phrases: list, colors: list = None):

    if colors is None:
        colors = ["black" for _ in range(len(caption_phrases))]

    fig, axes = plt.subplots(1, 2, figsize=(15, 4))

    ax = axes[0]
    ax.imshow(image)
    ax.set_xticks([])
    ax.set_yticks([])

    ax = axes[1]
    utt_per_line, col_per_line = split_caption_phrases(caption_phrases, colors, max_char_in_a_line=50)
    y = 0.7
    for U, C in zip(utt_per_line, col_per_line):
        rainbow_text(
            0., y,
            U,
            C,
            size=15, ax=ax, fig=fig,
            horizontalalignment='left',
            verticalalignment='center',
        )
        y -= 0.11

    ax.axis("off")

    fig.tight_layout()
    plt.show()


def show_images_and_caption(
        images: List,
        caption_phrases: list,
        colors: list = None,
        image_xlabels: List=[],
        figsize=None,
        show=False,
        xlabelsize=14,
    ):

    if colors is None:
        colors = ["black" for _ in range(len(caption_phrases))]
    caption_phrases[0] = caption_phrases[0].capitalize()

    if figsize is None:
        figsize = (5 * len(images) + 8, 4)

    if image_xlabels is None:
        image_xlabels = ["" for _ in range(len(images))]

    fig, axes = plt.subplots(1, len(images) + 1, figsize=figsize)

    for i, image in enumerate(images):
        ax = axes[i]
        ax.imshow(image)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlabel(image_xlabels[i], fontsize=xlabelsize)

    ax = axes[-1]
    utt_per_line, col_per_line = split_caption_phrases(caption_phrases, colors, max_char_in_a_line=40)
    y = 0.7
    for U, C in zip(utt_per_line, col_per_line):
        rainbow_text(
            0., y,
            U,
            C,
            size=23, ax=ax, fig=fig,
            horizontalalignment='left',
            verticalalignment='center',
            # weight='bold'
        )
        y -= 0.11

    ax.axis("off")

    fig.tight_layout()

    if show:
        plt.show()
clip_grounding/evaluation/clip_on_png.py ADDED
@@ -0,0 +1,362 @@
"""Evaluates cross-modal correspondence of CLIP on PNG images."""

import os
import sys
from os.path import join, exists

import warnings
warnings.filterwarnings('ignore')

from clip_grounding.utils.paths import REPO_PATH
sys.path.append(join(REPO_PATH, "CLIP_explainability/Transformer-MM-Explainability/"))

import torch
import CLIP.clip as clip
from PIL import Image
import numpy as np
import cv2
import matplotlib.pyplot as plt
from captum.attr import visualization
from torchmetrics import JaccardIndex
from collections import defaultdict
from IPython.core.display import display, HTML
from skimage import filters

from CLIP_explainability.utils import interpret, show_img_heatmap, show_txt_heatmap, color, _tokenizer
from clip_grounding.datasets.png import PNG
from clip_grounding.utils.image import pad_to_square
from clip_grounding.utils.visualize import show_grid_of_images
from clip_grounding.utils.log import tqdm_iterator, print_update


# global usage
# specify device
device = "cuda" if torch.cuda.is_available() else "cpu"

# load CLIP model
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)


def show_cam(mask):
    heatmap = cv2.applyColorMap(np.uint8(255 * mask), cv2.COLORMAP_JET)
    heatmap = np.float32(heatmap) / 255
    cam = heatmap
    cam = cam / np.max(cam)
    return cam


def interpret_and_generate(model, img, texts, orig_image, return_outputs=False, show=True):
    text = clip.tokenize(texts).to(device)
    R_text, R_image = interpret(model=model, image=img, texts=text, device=device)
    batch_size = text.shape[0]

    outputs = []
    for i in range(batch_size):
        text_scores, text_tokens_decoded = show_txt_heatmap(texts[i], text[i], R_text[i], show=show)
        image_relevance = show_img_heatmap(R_image[i], img, orig_image=orig_image, device=device, show=show)
        plt.show()
        outputs.append({"text_scores": text_scores, "image_relevance": image_relevance, "tokens_decoded": text_tokens_decoded})

    if return_outputs:
        return outputs


def process_entry_text_to_image(entry, unimodal=False):
    image = entry['image']
    text_mask = entry['text_mask']
    text = entry['text']
    orig_image = pad_to_square(image)

    img = preprocess(orig_image).unsqueeze(0).to(device)
    text_index = text_mask.argmax()
    texts = [text[text_index]] if not unimodal else ['']

    return img, texts, orig_image


def preprocess_ground_truth_mask(mask, resize_shape):
    mask = Image.fromarray(mask.astype(np.uint8) * 255)
    mask = pad_to_square(mask, color=0)
    mask = mask.resize(resize_shape)
    mask = np.asarray(mask) / 255.
    return mask


def apply_otsu_threshold(relevance_map):
    threshold = filters.threshold_otsu(relevance_map)
    otsu_map = (relevance_map > threshold).astype(np.uint8)
    return otsu_map


def evaluate_text_to_image(method, dataset, debug=False):

    instance_level_metrics = defaultdict(list)
    entry_level_metrics = defaultdict(list)

    jaccard = JaccardIndex(num_classes=2)
    jaccard = jaccard.to(device)

    num_iter = len(dataset)
    if debug:
        num_iter = 100

    iterator = tqdm_iterator(range(num_iter), desc=f"Evaluating on {type(dataset).__name__} dataset")
    for idx in iterator:
        instance = dataset[idx]

        instance_iou = 0.
        for entry in instance:

            # preprocess the image and text
            unimodal = True if method == "clip-unimodal" else False
            test_img, test_texts, orig_image = process_entry_text_to_image(entry, unimodal=unimodal)

            if method in ["clip", "clip-unimodal"]:

                # compute the relevance scores
                outputs = interpret_and_generate(model, test_img, test_texts, orig_image, return_outputs=True, show=False)

                # use the image relevance score to compute IoU w.r.t. ground truth segmentation masks

                # NOTE: since we pass single entry (1-sized batch), outputs[0] contains our reqd outputs
                relevance_map = outputs[0]["image_relevance"]
            elif method == "random":
                relevance_map = np.random.uniform(low=0., high=1., size=tuple(test_img.shape[2:]))

            otsu_relevance_map = apply_otsu_threshold(relevance_map)

            ground_truth_mask = entry["image_mask"]
            ground_truth_mask = preprocess_ground_truth_mask(ground_truth_mask, relevance_map.shape)

            entry_iou = jaccard(
                torch.from_numpy(otsu_relevance_map).to(device),
                torch.from_numpy(ground_truth_mask.astype(np.uint8)).to(device),
            )
            entry_iou = entry_iou.item()
            instance_iou += (entry_iou / len(entry))

            entry_level_metrics["iou"].append(entry_iou)

        # capture instance (image-sentence pair) level IoU
        instance_level_metrics["iou"].append(instance_iou)

    average_metrics = {k: np.mean(v) for k, v in entry_level_metrics.items()}

    return (
        average_metrics,
        instance_level_metrics,
        entry_level_metrics
    )


def process_entry_image_to_text(entry, unimodal=False):

    if not unimodal:
        if len(np.asarray(entry["image"]).shape) == 3:
            mask = np.repeat(np.expand_dims(entry['image_mask'], -1), 3, axis=-1)
        else:
            mask = np.asarray(entry['image_mask'])

        masked_image = (mask * np.asarray(entry['image'])).astype(np.uint8)
        masked_image = Image.fromarray(masked_image)
        orig_image = pad_to_square(masked_image)
        img = preprocess(orig_image).unsqueeze(0).to(device)
    else:
        orig_image_shape = max(np.asarray(entry['image']).shape[:2])
        orig_image = Image.fromarray(np.zeros((orig_image_shape, orig_image_shape, 3), dtype=np.uint8))
        # orig_image = Image.fromarray(np.random.randint(0, 256, (orig_image_shape, orig_image_shape, 3), dtype=np.uint8))
        img = preprocess(orig_image).unsqueeze(0).to(device)

    texts = [' '.join(entry['text'])]

    return img, texts, orig_image


def process_text_mask(text, text_mask, tokens):

    token_level_mask = np.zeros(len(tokens))

    for label, subtext in zip(text_mask, text):

        subtext_tokens=_tokenizer.encode(subtext)
        subtext_tokens_decoded=[_tokenizer.decode([a]) for a in subtext_tokens]

        if label == 1:
            start = tokens.index(subtext_tokens_decoded[0])
            end = tokens.index(subtext_tokens_decoded[-1])
            token_level_mask[start:end + 1] = 1

    return token_level_mask


def evaluate_image_to_text(method, dataset, debug=False, clamp_sentence_len=70):

    instance_level_metrics = defaultdict(list)
    entry_level_metrics = defaultdict(list)

    # skipped if text length > 77 which is CLIP limit
    num_entries_skipped = 0
    num_total_entries = 0

    num_iter = len(dataset)
    if debug:
        num_iter = 100

    jaccard_image_to_text = JaccardIndex(num_classes=2).to(device)

    iterator = tqdm_iterator(range(num_iter), desc=f"Evaluating on {type(dataset).__name__} dataset")
    for idx in iterator:
        instance = dataset[idx]

        instance_iou = 0.
        for entry in instance:
            num_total_entries += 1

            # preprocess the image and text
            unimodal = True if method == "clip-unimodal" else False
            img, texts, orig_image = process_entry_image_to_text(entry, unimodal=unimodal)

            appx_total_sent_len = np.sum([len(x.split(" ")) for x in texts])
            if appx_total_sent_len > clamp_sentence_len:
                # print(f"Skipping an entry since it's text has appx"\
                #     " {appx_total_sent_len} while CLIP cannot process beyond {clamp_sentence_len}")
                num_entries_skipped += 1
                continue

            # compute the relevance scores
            if method in ["clip", "clip-unimodal"]:
                try:
                    outputs = interpret_and_generate(model, img, texts, orig_image, return_outputs=True, show=False)
                except:
                    num_entries_skipped += 1
                    continue
            elif method == "random":
                text = texts[0]
                text_tokens = _tokenizer.encode(text)
                text_tokens_decoded=[_tokenizer.decode([a]) for a in text_tokens]
                outputs = [
                    {
                        "text_scores": np.random.uniform(low=0., high=1., size=len(text_tokens_decoded)),
                        "tokens_decoded": text_tokens_decoded,
                    }
                ]

            # use the text relevance score to compute IoU w.r.t. ground truth text masks
            # NOTE: since we pass single entry (1-sized batch), outputs[0] contains our reqd outputs
            token_relevance_scores = outputs[0]["text_scores"]
            if isinstance(token_relevance_scores, torch.Tensor):
                token_relevance_scores = token_relevance_scores.cpu().numpy()
            token_relevance_scores = apply_otsu_threshold(token_relevance_scores)
            token_ground_truth_mask = process_text_mask(entry["text"], entry["text_mask"], outputs[0]["tokens_decoded"])

            entry_iou = jaccard_image_to_text(
                torch.from_numpy(token_relevance_scores).to(device),
                torch.from_numpy(token_ground_truth_mask.astype(np.uint8)).to(device),
            )
            entry_iou = entry_iou.item()

            instance_iou += (entry_iou / len(entry))
            entry_level_metrics["iou"].append(entry_iou)

        # capture instance (image-sentence pair) level IoU
        instance_level_metrics["iou"].append(instance_iou)

    print(f"CAUTION: Skipped {(num_entries_skipped / num_total_entries) * 100} % since these had length > 77 (CLIP limit).")
    average_metrics = {k: np.mean(v) for k, v in entry_level_metrics.items()}

    return (
        average_metrics,
        instance_level_metrics,
        entry_level_metrics
    )


if __name__ == "__main__":

    import argparse
    parser = argparse.ArgumentParser("Evaluate Image-to-Text & Text-to-Image model")
    parser.add_argument(
        "--eval_method", type=str, default="clip",
        choices=["clip", "random", "clip-unimodal"],
        help="Evaluation method to use",
    )
    parser.add_argument(
        "--ignore_cache", action="store_true",
        help="Ignore cache and force re-generation of the results",
    )
    parser.add_argument(
        "--debug", action="store_true",
        help="Run evaluation on a small subset of the dataset",
    )
    args = parser.parse_args()

    print_update("Using evaluation method: {}".format(args.eval_method))


    clip.clip._MODELS = {
        "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
        "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
    }

    # specify device
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # load CLIP model
    print_update("Loading CLIP model...")
    model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
    print()

    # load PNG dataset
    print_update("Loading PNG dataset...")
    dataset = PNG(dataset_root=join(REPO_PATH, "data", "panoptic_narrative_grounding"), split="val2017")
    print()

    # evaluate

    # save metrics
    metrics_dir = join(REPO_PATH, "outputs")
    os.makedirs(metrics_dir, exist_ok=True)

    metrics_path = join(metrics_dir, f"{args.eval_method}_on_{type(dataset).__name__}_text2image_metrics.pt")
    if (not exists(metrics_path)) or args.ignore_cache:
        print_update("Computing metrics for text-to-image grounding")
        average_metrics, instance_level_metrics, entry_level_metrics = evaluate_text_to_image(
            args.eval_method, dataset, debug=args.debug,
        )
        metrics = {
            "average_metrics": average_metrics,
            "instance_level_metrics":instance_level_metrics,
            "entry_level_metrics": entry_level_metrics
        }

        torch.save(metrics, metrics_path)
        print("TEXT2IMAGE METRICS SAVED TO:", metrics_path)
    else:
        print(f"Metrics already exist at: {metrics_path}. Loading cached metrics.")
        metrics = torch.load(metrics_path)
    average_metrics = metrics["average_metrics"]
    print("TEXT2IMAGE METRICS:", np.round(average_metrics["iou"], 4))

    print()

    metrics_path = join(metrics_dir, f"{args.eval_method}_on_{type(dataset).__name__}_image2text_metrics.pt")
    if (not exists(metrics_path)) or args.ignore_cache:
        print_update("Computing metrics for image-to-text grounding")
        average_metrics, instance_level_metrics, entry_level_metrics = evaluate_image_to_text(
            args.eval_method, dataset, debug=args.debug,
        )

        torch.save(
            {
                "average_metrics": average_metrics,
                "instance_level_metrics":instance_level_metrics,
                "entry_level_metrics": entry_level_metrics
            },
            metrics_path,
        )
        print("IMAGE2TEXT METRICS SAVED TO:", metrics_path)
    else:
        print(f"Metrics already exist at: {metrics_path}. Loading cached metrics.")
        metrics = torch.load(metrics_path)
    average_metrics = metrics["average_metrics"]
    print("IMAGE2TEXT METRICS:", np.round(average_metrics["iou"], 4))
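Besides the command-line entry point (with `--eval_method clip|random|clip-unimodal`, `--ignore_cache`, and `--debug`), the evaluation functions can also be called directly. A minimal sketch follows; it is not part of the commit and assumes the PNG data is in place under `data/panoptic_narrative_grounding` and that the module-level CLIP load on import is acceptable.

# Calling the text-to-image evaluation directly (illustrative, not part of the commit).
from os.path import join
from clip_grounding.utils.paths import REPO_PATH
from clip_grounding.datasets.png import PNG
from clip_grounding.evaluation.clip_on_png import evaluate_text_to_image

dataset = PNG(dataset_root=join(REPO_PATH, "data", "panoptic_narrative_grounding"), split="val2017")
# debug=True restricts the loop to the first 100 narratives
average, per_instance, per_entry = evaluate_text_to_image("clip", dataset, debug=True)
print("mean IoU over entries:", average["iou"])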
clip_grounding/evaluation/qualitative_results.py ADDED
@@ -0,0 +1,93 @@
"""Converts notebook for qualitative results to a python script."""
import sys
from os.path import join

from clip_grounding.utils.paths import REPO_PATH
sys.path.append(join(REPO_PATH, "CLIP_explainability/Transformer-MM-Explainability/"))

import os
import torch
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
import CLIP.clip as clip
import cv2
from PIL import Image
from glob import glob
from natsort import natsorted

from clip_grounding.utils.paths import REPO_PATH
from clip_grounding.utils.io import load_json
from clip_grounding.utils.visualize import set_latex_fonts, show_grid_of_images
from clip_grounding.utils.image import pad_to_square
from clip_grounding.datasets.png_utils import show_images_and_caption
from clip_grounding.datasets.png import (
    PNG,
    visualize_item,
    overlay_segmask_on_image,
    overlay_relevance_map_on_image,
    get_text_colors,
)
from clip_grounding.evaluation.clip_on_png import (
    process_entry_image_to_text,
    process_entry_text_to_image,
    interpret_and_generate,
)

# load dataset
dataset = PNG(dataset_root=join(REPO_PATH, "data/panoptic_narrative_grounding"), split="val2017")

# load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)


def visualize_entry_text_to_image(entry, pad_images=True, figsize=(18, 5)):
    test_img, test_texts, orig_image = process_entry_text_to_image(entry, unimodal=False)
    outputs = interpret_and_generate(model, test_img, test_texts, orig_image, return_outputs=True, show=False)
    relevance_map = outputs[0]["image_relevance"]

    image_with_mask = overlay_segmask_on_image(entry["image"], entry["image_mask"])
    if pad_images:
        image_with_mask = pad_to_square(image_with_mask)

    image_with_relevance_map = overlay_relevance_map_on_image(entry["image"], relevance_map)
    if pad_images:
        image_with_relevance_map = pad_to_square(image_with_relevance_map)

    text_colors = get_text_colors(entry["text"], entry["text_mask"])

    show_images_and_caption(
        [image_with_mask, image_with_relevance_map],
        entry["text"], text_colors, figsize=figsize,
        image_xlabels=["Ground truth segmentation", "Predicted relevance map"]
    )


def create_and_save_gif(filenames, save_path, **kwargs):
    import imageio
    images = []
    for filename in filenames:
        images.append(imageio.imread(filename))
    imageio.mimsave(save_path, images, **kwargs)


idx = 100
instance = dataset[idx]

instance_dir = join(REPO_PATH, "figures", f"instance-{idx}")
os.makedirs(instance_dir, exist_ok=True)

for i, entry in enumerate(instance):
    del entry["full_caption"]

    visualize_entry_text_to_image(entry, pad_images=False, figsize=(19, 4))

    save_path = instance_dir
    plt.savefig(join(instance_dir, f"viz-{i}.png"), bbox_inches="tight")


filenames = natsorted(glob(join(instance_dir, "viz-*.png")))
save_path = join(REPO_PATH, "media", "sample.gif")

create_and_save_gif(filenames, save_path, duration=3)
clip_grounding/utils/image.py ADDED
@@ -0,0 +1,46 @@
"""Image operations."""
from copy import deepcopy
from PIL import Image


def center_crop(im: Image):
    width, height = im.size
    new_width = width if width < height else height
    new_height = height if height < width else width

    left = (width - new_width)/2
    top = (height - new_height)/2
    right = (width + new_width)/2
    bottom = (height + new_height)/2

    # Crop the center of the image
    im = im.crop((left, top, right, bottom))

    return im


def pad_to_square(im: Image, color=(0, 0, 0)):
    im = deepcopy(im)
    width, height = im.size

    vert_pad = (max(width, height) - height) // 2
    hor_pad = (max(width, height) - width) // 2

    if len(im.mode) == 3:
        color = (0, 0, 0)
    elif len(im.mode) == 1:
        color = 0
    else:
        raise ValueError(f"Image mode not supported. Image has {im.mode} channels.")

    return add_margin(im, vert_pad, hor_pad, vert_pad, hor_pad, color=color)


def add_margin(pil_img, top, right, bottom, left, color=(0, 0, 0)):
    """Ref: https://note.nkmk.me/en/python-pillow-add-margin-expand-canvas/"""
    width, height = pil_img.size
    new_width = width + right + left
    new_height = height + top + bottom
    result = Image.new(pil_img.mode, (new_width, new_height), color)
    result.paste(pil_img, (left, top))
    return result
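A small sketch contrasting the two resizing helpers is given below; it is not part of the commit. It reuses one of the example images shipped with the Space and converts to RGB so the channel check inside `pad_to_square` passes.

# pad_to_square letterboxes the image; center_crop discards the borders (illustrative only).
from PIL import Image
from clip_grounding.utils.image import pad_to_square, center_crop

im = Image.open("example_images/Amsterdam.png").convert("RGB")
print(im.size)                 # original (width, height)
print(pad_to_square(im).size)  # both sides equal max(width, height)
print(center_crop(im).size)    # both sides equal min(width, height)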
clip_grounding/utils/io.py ADDED
@@ -0,0 +1,116 @@
"""
Utilities for input-output loading/saving.
"""

from typing import Any, List
import yaml
import pickle
import json


class PrettySafeLoader(yaml.SafeLoader):
    """Custom loader for reading YAML files"""
    def construct_python_tuple(self, node):
        return tuple(self.construct_sequence(node))


PrettySafeLoader.add_constructor(
    u'tag:yaml.org,2002:python/tuple',
    PrettySafeLoader.construct_python_tuple
)


def load_yml(path: str, loader_type: str = 'default'):
    """Read params from a yml file.

    Args:
        path (str): path to the .yml file
        loader_type (str, optional): type of loader used to load yml files. Defaults to 'default'.

    Returns:
        Any: object (typically dict) loaded from .yml file
    """
    assert loader_type in ['default', 'safe']

    loader = yaml.Loader if (loader_type == "default") else PrettySafeLoader

    with open(path, 'r') as f:
        data = yaml.load(f, Loader=loader)

    return data


def save_yml(data: dict, path: str):
    """Save params in the given yml file path.

    Args:
        data (dict): data object to save
        path (str): path to .yml file to be saved
    """
    with open(path, 'w') as f:
        yaml.dump(data, f, default_flow_style=False)


def load_pkl(path: str, encoding: str = "ascii") -> Any:
    """Loads a .pkl file.

    Args:
        path (str): path to the .pkl file
        encoding (str, optional): encoding to use for loading. Defaults to "ascii".

    Returns:
        Any: unpickled object
    """
    return pickle.load(open(path, "rb"), encoding=encoding)


def save_pkl(data: Any, path: str) -> None:
    """Saves given object into .pkl file

    Args:
        data (Any): object to be saved
        path (str): path to the location to be saved at
    """
    with open(path, 'wb') as f:
        pickle.dump(data, f)


def load_json(path: str) -> dict:
    """Helper to load json file"""
    with open(path, 'rb') as f:
        data = json.load(f)
    return data


def save_json(data: dict, path: str):
    """Helper to save `dict` as .json file."""
    with open(path, 'w') as f:
        json.dump(data, f)


def load_txt(path: str) -> List:
    """Loads lines of a .txt file.

    Args:
        path (str): path to the .txt file

    Returns:
        List: lines of .txt file
    """
    with open(path) as f:
        lines = f.read().splitlines()
    return lines


def save_txt(data: dict, path: str):
    """Writes data (lines) to a txt file.

    Args:
        data (dict): List of strings
        path (str): path to .txt file
    """
    assert isinstance(data, list)

    lines = "\n".join(data)
    with open(path, "w") as f:
        f.write(str(lines))
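A minimal round-trip with the JSON helpers (not part of the commit; the temporary path is arbitrary):

# Round-trip example for save_json / load_json (illustrative only).
from clip_grounding.utils.io import save_json, load_json

record = {"image_id": 139, "caption": "Two dogs sleeping on a bed."}
save_json(record, "/tmp/example.json")
assert load_json("/tmp/example.json") == record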
clip_grounding/utils/log.py ADDED
@@ -0,0 +1,57 @@
"""Utilities for logging"""
import logging
from tqdm import tqdm
from termcolor import colored


def color(string: str, color_name: str = 'yellow') -> str:
    """Returns colored string for output to terminal"""
    return colored(string, color_name)


def print_update(message: str, width: int = 140, fillchar: str = ":", color="yellow") -> str:
    """Prints an update message

    Args:
        message (str): message
        width (int): width of new update message
        fillchar (str): character to be filled to L and R of message

    Returns:
        str: print-ready update message
    """
    message = message.center(len(message) + 2, " ")
    print(colored(message.center(width, fillchar), color))


def set_logger(log_path):
    """Set the logger to log info in terminal and file `log_path`.

    Args:
        log_path (str): path to the log file
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    if not logger.handlers:
        # Logging to a file
        file_handler = logging.FileHandler(log_path)
        file_handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
        logger.addHandler(file_handler)

        # Logging to console
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter('%(message)s'))
        logger.addHandler(stream_handler)


def tqdm_iterator(items, desc=None, bar_format=None, **kwargs):
    tqdm._instances.clear()
    iterator = tqdm(
        items,
        desc=desc,
        bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}',
        **kwargs,
    )

    return iterator
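A short usage sketch for the logging helpers (not part of the commit):

# print_update prints a centered, colored banner; tqdm_iterator wraps tqdm with a fixed bar format.
import time
from clip_grounding.utils.log import print_update, tqdm_iterator

print_update("Loading CLIP model...")
for _ in tqdm_iterator(range(3), desc="demo"):
    time.sleep(0.1)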
clip_grounding/utils/paths.py ADDED
@@ -0,0 +1,10 @@
"""Path helpers for the relfm project."""
from os.path import join, abspath, dirname


REPO_PATH = dirname(dirname(dirname(abspath(__file__))))
DATA_ROOT = join(REPO_PATH, "data")

DATASET_ROOTS = {
    "PNG": join(DATA_ROOT, "panoptic_narrative_grounding"),
}
clip_grounding/utils/visualize.py
ADDED
@@ -0,0 +1,183 @@
"""Helpers for visualization"""
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import cv2
from PIL import Image


# define predominant colors
COLORS = {
    "pink": (242, 116, 223),
    "cyan": (46, 242, 203),
    "red": (255, 0, 0),
    "green": (0, 255, 0),
    "blue": (0, 0, 255),
    "yellow": (255, 255, 0),
}


def show_single_image(image: np.ndarray, figsize: tuple = (8, 8), title: str = None, titlesize=18, cmap: str = None, ticks=False, save=False, save_path=None):
    """Show a single image."""
    fig, ax = plt.subplots(1, 1, figsize=figsize)

    if isinstance(image, Image.Image):
        image = np.asarray(image)

    ax.set_title(title, fontsize=titlesize)
    ax.imshow(image, cmap=cmap)

    if not ticks:
        ax.set_xticks([])
        ax.set_yticks([])

    if save:
        plt.savefig(save_path, bbox_inches='tight')

    plt.show()


def show_grid_of_images(
        images: np.ndarray, n_cols: int = 4, figsize: tuple = (8, 8),
        cmap=None, subtitles=None, title=None, subtitlesize=18,
        save=False, save_path=None, titlesize=20,
    ):
    """Show a grid of images."""
    n_cols = min(n_cols, len(images))

    copy_of_images = images.copy()
    for i, image in enumerate(copy_of_images):
        if isinstance(image, Image.Image):
            image = np.asarray(image)
            images[i] = image

    if subtitles is None:
        subtitles = [None] * len(images)

    n_rows = int(np.ceil(len(images) / n_cols))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize)
    for i, ax in enumerate(axes.flat):
        if i < len(images):
            if len(images[i].shape) == 2 and cmap is None:
                cmap = "gray"
            ax.imshow(images[i], cmap=cmap)
            ax.set_title(subtitles[i], fontsize=subtitlesize)
        ax.axis('off')
    fig.set_tight_layout(True)
    plt.suptitle(title, y=0.8, fontsize=titlesize)

    if save:
        plt.savefig(save_path, bbox_inches='tight')
        plt.close()
    else:
        plt.show()


def show_keypoint_matches(
        img1, kp1, img2, kp2, matches,
        K=10, figsize=(10, 5), drawMatches_args=dict(matchesThickness=3, singlePointColor=(0, 0, 0)),
        choose_matches="random",
    ):
    """Displays matches found in the pair of images"""
    if choose_matches == "random":
        selected_matches = np.random.choice(matches, K)
    elif choose_matches == "all":
        K = len(matches)
        selected_matches = matches
    elif choose_matches == "topk":
        selected_matches = matches[:K]
    else:
        raise ValueError(f"Unknown value for choose_matches: {choose_matches}")

    # color each match with a different color
    cmap = matplotlib.cm.get_cmap('gist_rainbow', K)
    colors = [[int(x * 255) for x in cmap(i)[:3]] for i in np.arange(0, K)]
    drawMatches_args.update({"matchColor": -1, "singlePointColor": (100, 100, 100)})

    img3 = cv2.drawMatches(img1, kp1, img2, kp2, selected_matches, outImg=None, **drawMatches_args)
    show_single_image(
        img3,
        figsize=figsize,
        title=f"[{choose_matches.upper()}] Selected K = {K} matches between the pair of images.",
    )
    return img3


def draw_kps_on_image(image: np.ndarray, kps: np.ndarray, color=COLORS["red"], radius=3, thickness=-1, return_as="numpy"):
    """
    Draw keypoints on image.

    Args:
        image: Image to draw keypoints on.
        kps: Keypoints to draw. Note these should be in (x, y) format.
    """
    if isinstance(image, Image.Image):
        image = np.asarray(image)

    for kp in kps:
        image = cv2.circle(
            image, (int(kp[0]), int(kp[1])), radius=radius, color=color, thickness=thickness)

    if return_as == "PIL":
        return Image.fromarray(image)

    return image


def get_concat_h(im1, im2):
    """Concatenate two images horizontally"""
    dst = Image.new('RGB', (im1.width + im2.width, im1.height))
    dst.paste(im1, (0, 0))
    dst.paste(im2, (im1.width, 0))
    return dst


def get_concat_v(im1, im2):
    """Concatenate two images vertically"""
    dst = Image.new('RGB', (im1.width, im1.height + im2.height))
    dst.paste(im1, (0, 0))
    dst.paste(im2, (0, im1.height))
    return dst


def show_images_with_keypoints(images: list, kps: list, radius=15, color=(0, 220, 220), figsize=(10, 8), return_images=False, save=False, save_path="sample.png"):
    assert len(images) == len(kps)

    # generate
    images_with_kps = []
    for i in range(len(images)):
        img_with_kps = draw_kps_on_image(images[i], kps[i], radius=radius, color=color, return_as="PIL")
        images_with_kps.append(img_with_kps)

    # show
    show_grid_of_images(images_with_kps, n_cols=len(images), figsize=figsize, save=save, save_path=save_path)

    if return_images:
        return images_with_kps


def set_latex_fonts(usetex=True, fontsize=14, show_sample=False, **kwargs):
    try:
        plt.rcParams.update({
            "text.usetex": usetex,
            "font.family": "serif",
            "font.serif": ["Computer Modern Roman"],
            "font.size": fontsize,
            **kwargs,
        })
        if show_sample:
            plt.figure()
            plt.title("Sample $y = x^2$")
            plt.plot(np.arange(0, 10), np.arange(0, 10)**2, "--o")
            plt.grid()
            plt.show()
    except Exception:
        print("Failed to set up LaTeX fonts. Proceeding without.")
        pass


def get_colors(num_colors, palette="jet"):
    cmap = plt.get_cmap(palette)
    colors = [cmap(i) for i in np.linspace(0, 1, num_colors)]
    return colors
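A minimal, self-contained sketch exercising a few of the helpers above. Not part of the commit; the canvas, keypoints, and subtitles are dummy data chosen purely for illustration, and it assumes the module is importable as clip_grounding.utils.visualize.

# Illustrative example only (not part of this commit); dummy data throughout
import numpy as np
from PIL import Image
from clip_grounding.utils.visualize import (
    draw_kps_on_image, get_concat_h, show_grid_of_images,
)

# A blank 128x128 RGB canvas and three (x, y) keypoints
canvas = np.zeros((128, 128, 3), dtype=np.uint8)
kps = np.array([[20, 30], [64, 64], [100, 90]])

with_kps = draw_kps_on_image(canvas.copy(), kps, radius=5, return_as="PIL")
original = Image.fromarray(canvas)

side_by_side = get_concat_h(original, with_kps)   # original next to annotated copy
show_grid_of_images([original, with_kps], n_cols=2, figsize=(6, 3), subtitles=["input", "keypoints"])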
example_images/Amsterdam.png
ADDED
example_images/London.png
ADDED
example_images/dogs_on_bed.png
ADDED
example_images/harrypotter.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,121 @@
anyio==3.6.1
appnope==0.1.3
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
asttokens==2.0.5
attrs==21.4.0
Babel==2.10.3
backcall==0.2.0
beautifulsoup4==4.11.1
bleach==5.0.0
captum==0.5.0
certifi==2022.6.15
cffi==1.15.0
charset-normalizer==2.0.12
cycler==0.11.0
debugpy==1.6.0
decorator==5.1.1
defusedxml==0.7.1
entrypoints==0.4
executing==0.8.3
fastjsonschema==2.15.3
fonttools==4.33.3
ftfy==6.1.1
htmlmin==0.1.12
idna==3.3
ImageHash==4.2.1
imageio==2.19.3
imgaug==0.4.0
importlib-metadata==4.11.4
ipdb==0.13.9
ipykernel==6.15.0
ipython==8.4.0
ipython-genutils==0.2.0
ipywidgets==7.7.1
jedi==0.18.1
Jinja2==3.1.2
joblib==1.1.0
json5==0.9.8
jsonschema==4.6.0
jupyter-client==7.3.4
jupyter-core==4.10.0
jupyter-server==1.18.0
jupyterlab==3.4.3
jupyterlab-pygments==0.2.2
jupyterlab-server==2.14.0
jupyterlab-widgets==1.1.1
kiwisolver==1.4.3
MarkupSafe==2.1.1
matplotlib==3.5.2
matplotlib-inline==0.1.3
missingno==0.5.1
mistune==0.8.4
multimethod==1.8
natsort==8.1.0
nbclassic==0.3.7
nbclient==0.6.4
nbconvert==6.5.0
nbformat==5.4.0
nest-asyncio==1.5.5
networkx==2.8.4
notebook==6.4.12
notebook-shim==0.1.0
numpy==1.23.0
opencv-python==4.6.0.66
packaging==21.3
pandas==1.4.3
pandas-profiling==3.2.0
pandocfilters==1.5.0
parso==0.8.3
pexpect==4.8.0
phik==0.12.2
pickleshare==0.7.5
Pillow==9.1.1
prometheus-client==0.14.1
prompt-toolkit==3.0.29
psutil==5.9.1
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
pydantic==1.9.1
Pygments==2.12.0
pyparsing==3.0.9
pyrsistent==0.18.1
python-dateutil==2.8.2
pytz==2022.1
PyWavelets==1.3.0
PyYAML==6.0
pyzmq==23.2.0
regex==2022.6.2
requests==2.28.0
scikit-image==0.19.3
scikit-learn==1.1.1
scipy==1.8.1
seaborn==0.11.2
Send2Trash==1.8.0
Shapely==1.8.2
six==1.16.0
sniffio==1.2.0
soupsieve==2.3.2.post1
stack-data==0.3.0
tangled-up-in-unicode==0.2.0
termcolor==1.1.0
terminado==0.15.0
threadpoolctl==3.1.0
tifffile==2022.5.4
tinycss2==1.1.1
toml==0.10.2
torch==1.11.0
torchmetrics==0.9.1
torchvision==0.12.0
tornado==6.1
tqdm==4.64.0
traitlets==5.3.0
typing_extensions==4.2.0
urllib3==1.26.9
visions==0.7.4
wcwidth==0.2.5
webencodings==0.5.1
websocket-client==1.3.3
widgetsnbextension==3.6.1
zipp==3.8.0