Spaces: Running on Zero

update model

- app.py +68 -554
- backbone.py +394 -0
- images/example_a.jpg +0 -0
- images/image_0.jpg +0 -0
- images/image_0_small.jpg +0 -0
- images/image_1.jpg +0 -0
- images/image_1_small.jpg +0 -0
- images/image_2.jpg +0 -0
- images/image_2_small.jpg +0 -0
- images/image_3.jpg +0 -0
- images/image_3_small.jpg +0 -0
- images/image_5.jpg +0 -0
- images/image_5_small.jpg +0 -0
- images/ncut_0_small.jpg +0 -0
- images/ncut_1_small.jpg +0 -0
- images/ncut_2_small.jpg +0 -0
- images/ncut_3_small.jpg +0 -0
- images/ncut_5_small.jpg +0 -0
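
The substance of the commit is a refactor: the model-wrapper code that used to live in app.py moves into a new backbone.py, and app.py keeps only the NCUT/embedding logic and the Gradio UI, importing a single `extract_features` entry point. A minimal sketch of how the two files fit together after this commit (the image paths and parameter values below are illustrative, not part of the diff):

```python
# Hypothetical driver: backbone.py owns the models, app.py (or any script)
# only calls extract_features and then post-processes the features.
from PIL import Image

from backbone import extract_features  # module added by this commit

images = [Image.open(p) for p in ["./images/image_0.jpg", "./images/image_1.jpg"]]

# extract_features resizes/normalizes internally and returns per-pixel features
# with shape (num_images, height, width, channels) for the requested layer.
features = extract_features(
    images,
    model_name="SAM(sam_vit_b)",  # any key of backbone.MODEL_DICT
    node_type="block",            # "attn", "mlp", or "block"
    layer=11,
)
print(features.shape)
```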
app.py
CHANGED
@@ -1,536 +1,17 @@
-
-
  import torch
- import torch.nn.functional as F
  from PIL import Image
- from torch import nn
  import numpy as np
- import os
  import time

  import gradio as gr

- import
-
- USE_CUDA = torch.cuda.is_available()
- print("CUDA is available:", USE_CUDA)
-
- def transform_images(images, resolution=(1024, 1024)):   # unchanged, moved to backbone.py (added below)
-     ...
-
- class MobileSAM(nn.Module):   # MobileSAM (vit_t) image-encoder wrapper whose patched block forwards capture attn/mlp/block outputs; moved to backbone.py
-     ...
-
- mobilesam = MobileSAM()
-
- def image_mobilesam_feature(
-     images,
-     node_type="block",
-     layer=-1,
- ):
-     print("Running MobileSAM")
-     global USE_CUDA
-     if USE_CUDA:
-         images = images.cuda()
-
-     global mobilesam
-     feat_extractor = mobilesam
-     if USE_CUDA:
-         feat_extractor = feat_extractor.cuda()
-
-     print("images shape:", images.shape)
-     # attn_outputs, mlp_outputs, block_outputs = [], [], []
-     outputs = []
-     for i in range(images.shape[0]):
-         attn_output, mlp_output, block_output = feat_extractor(
-             images[i].unsqueeze(0)
-         )
-         out_dict = {
-             "attn": attn_output,
-             "mlp": mlp_output,
-             "block": block_output,
-         }
-         out = out_dict[node_type]
-         out = out[layer]
-         outputs.append(out)
-     outputs = torch.cat(outputs, dim=0)
-
-     return outputs
-
- class SAM(torch.nn.Module):   # SAM vit_b image-encoder wrapper with the same capture hooks; moved to backbone.py
-     ...
-
- sam = SAM()
-
- def image_sam_feature(images, node_type="block", layer=-1):   # same per-image loop as image_mobilesam_feature, using the SAM wrapper
-     ...
-
- class DiNOv2(torch.nn.Module):   # dinov2_vitb14_reg wrapper with the same capture hooks; moved to backbone.py, where the token-to-grid reshape now happens inside the class
-     ...
-
- dinov2 = DiNOv2()
-
- def image_dino_feature(images, node_type="block", layer=-1):   # same per-image loop, then rearrange(outputs[:, 5:, :], "b (h w) c -> b h w c", h=32, w=32)
-     ...
-
- class CLIP(torch.nn.Module):   # openai/clip-vit-base-patch16 vision-tower wrapper with the same capture hooks; moved to backbone.py
-     ...
-
- clip = CLIP()
-
- def image_clip_feature(images, node_type="block", layer=-1):   # same per-image loop, using the CLIP wrapper
-     ...
-
- import hashlib
- import pickle
- import sys
- from collections import OrderedDict
-
- # Cache dictionary with limited size
- class LimitedSizeCache(OrderedDict):
-     def __init__(self, max_size_bytes):
-         self.max_size_bytes = max_size_bytes
-         self.current_size_bytes = 0
-         super().__init__()
-
-     def __setitem__(self, key, value):
-         item_size = self.get_item_size(value)
-         # Evict items until there is enough space
-         while self.current_size_bytes + item_size > self.max_size_bytes:
-             self.popitem(last=False)
-         super().__setitem__(key, value)
-         self.current_size_bytes += item_size
-
-     def __delitem__(self, key):
-         value = self[key]
-         super().__delitem__(key)
-         self.current_size_bytes -= self.get_item_size(value)
-
-     def get_item_size(self, value):
-         """Estimate the size of the value in bytes."""
-         return sys.getsizeof(value)
-
- # Initialize the cache with a 4GB limit
- cache = LimitedSizeCache(max_size_bytes=4 * 1024 * 1024 * 1024)  # 4GB
-
- def compute_hash(*args, **kwargs):
-     """Compute a unique hash based on the function arguments."""
-     hasher = hashlib.sha256()
-     pickled_args = pickle.dumps((args, kwargs))
-     hasher.update(pickled_args)
-     return hasher.hexdigest()
-
- def run_model_on_image(images, model_name="sam", node_type="block", layer=-1):
-     global USE_CUDA
-     USE_CUDA = True
-
-     if model_name == "SAM(sam_vit_b)":
-         if not USE_CUDA:
-             gr.warning("GPU not detected. Running SAM on CPU, ~30s/image.")
-         result = image_sam_feature(images, node_type=node_type, layer=layer)
-     elif model_name == 'MobileSAM':
-         result = image_mobilesam_feature(images, node_type=node_type, layer=layer)
-     elif model_name == "DiNO(dinov2_vitb14_reg)":
-         result = image_dino_feature(images, node_type=node_type, layer=layer)
-     elif model_name == "CLIP(openai/clip-vit-base-patch16)":
-         result = image_clip_feature(images, node_type=node_type, layer=layer)
-     else:
-         raise ValueError(f"Model {model_name} not supported.")
-
-     return result
-
- def extract_features(images, model_name="MobileSAM", node_type="block", layer=-1):
-     resolution_dict = {
-         "MobileSAM": (1024, 1024),
-         "SAM(sam_vit_b)": (1024, 1024),
-         "DiNO(dinov2_vitb14_reg)": (448, 448),
-         "CLIP(openai/clip-vit-base-patch16)": (224, 224),
-     }
-     images = transform_images(images, resolution=resolution_dict[model_name])
-
-     # Compute the cache key
-     cache_key = compute_hash(images, model_name, node_type, layer)
-
-     # Check if the result is already in the cache
-     if cache_key in cache:
-         print("Cache hit!")
-         return cache[cache_key]
-
-     result = run_model_on_image(images, model_name=model_name, node_type=node_type, layer=layer)
-
-     # Store the result in the cache
-     cache[cache_key] = result
-
-     return result

  def compute_ncut(
      features,
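
Every wrapper removed here (and re-added in backbone.py below) captures intermediate activations the same way: the block class's `forward` is replaced via `setattr` with a copy that clones `attn_output`, `mlp_output`, and `block_output` onto the module before returning. A self-contained sketch of that pattern with a toy block (not one of the real backbones):

```python
import torch
from torch import nn


class ToyBlock(nn.Module):
    """Stand-in for a transformer block; only the capture pattern matters."""

    def __init__(self, dim=8):
        super().__init__()
        self.attn = nn.Linear(dim, dim)
        self.mlp = nn.Linear(dim, dim)

    def forward(self, x):
        return x + self.mlp(x + self.attn(x))


def new_forward(self, x):
    # Same computation as the original forward, but each intermediate result
    # is cloned onto the module so callers can read it after the call.
    attn_out = self.attn(x)
    self.attn_output = attn_out.clone()
    x = x + attn_out
    mlp_out = self.mlp(x)
    self.mlp_output = mlp_out.clone()
    x = x + mlp_out
    self.block_output = x.clone()
    return x


block = ToyBlock()
setattr(block.__class__, "forward", new_forward)  # patch the class, as the app does
_ = block(torch.randn(2, 8))
print(block.attn_output.shape, block.mlp_output.shape, block.block_output.shape)
```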
@@ -540,18 +21,17 @@ def compute_ncut(
      knn_ncut=10,
      knn_tsne=10,
      embedding_method="UMAP",
-     num_sample_tsne=
-     perplexity=
-     n_neighbors=
      min_dist=0.1,
  ):
-     from ncut_pytorch import NCUT, rgb_from_tsne_3d, rgb_from_umap_3d

      start = time.time()
      eigvecs, eigvals = NCUT(
          num_eig=num_eig,
          num_sample=num_sample_ncut,
-         device="cuda" if
          affinity_focal_gamma=affinity_focal_gamma,
          knn=knn_ncut,
      ).fit_transform(features.reshape(-1, features.shape[-1]))
@@ -563,6 +43,7 @@ def compute_ncut(
              eigvecs,
              n_neighbors=n_neighbors,
              min_dist=min_dist,
          )
          print(f"UMAP time: {time.time() - start:.2f}s")
      elif embedding_method == "t-SNE":
@@ -571,6 +52,7 @@ def compute_ncut(
              num_sample=num_sample_tsne,
              perplexity=perplexity,
              knn=knn_tsne,
          )
          print(f"t-SNE time: {time.time() - start:.2f}s")
      else:
@@ -613,12 +95,16 @@ def main_fn(
      n_neighbors=500,
      min_dist=0.1,
  ):
-     if perplexity >= num_sample_tsne:
          # raise gr.Error("Perplexity must be less than the number of samples for t-SNE.")
-         gr.Warning("Perplexity must be less than the number of samples
          perplexity = num_sample_tsne - 1

-

      start = time.time()
      features = extract_features(
@@ -645,29 +131,57 @@ def main_fn(
  default_images = ['./images/image_0.jpg', './images/image_1.jpg', './images/image_2.jpg', './images/image_3.jpg', './images/image_5.jpg']
  default_outputs = ['./images/ncut_0.jpg', './images/ncut_1.jpg', './images/ncut_2.jpg', './images/ncut_3.jpg', './images/ncut_5.jpg']

  with gr.Blocks() as demo:
-
-     gr.
          main_fn,
-         [
-
-
-
-
          ],
-
-         additional_inputs=[
-             gr.Dropdown(["attn", "mlp", "block"], label="Node type", value="block", elem_id="node_type", info="attn: attention output, mlp: mlp output, block: sum of residual stream"),
-             gr.Slider(0.01, 1, step=0.01, label="Affinity focal gamma", value=0.3, elem_id="affinity_focal_gamma", info="decrease for more aggressive cleaning on the affinity matrix"),
-             gr.Slider(100, 50000, step=100, label="num_sample (NCUT)", value=10000, elem_id="num_sample_ncut", info="Nyström approximation"),
-             gr.Slider(1, 100, step=1, label="KNN (NCUT)", value=10, elem_id="knn_ncut", info="Nyström approximation"),
-             gr.Dropdown(["t-SNE", "UMAP"], label="Embedding method", value="t-SNE", elem_id="embedding_method"),
-             gr.Slider(100, 1000, step=100, label="num_sample (t-SNE/UMAP)", value=300, elem_id="num_sample_tsne", info="Nyström approximation"),
-             gr.Slider(1, 100, step=1, label="KNN (t-SNE/UMAP)", value=10, elem_id="knn_tsne", info="Nyström approximation"),
-             gr.Slider(10, 500, step=10, label="Perplexity (t-SNE)", value=150, elem_id="perplexity"),
-             gr.Slider(10, 500, step=10, label="n_neighbors (UMAP)", value=150, elem_id="n_neighbors"),
-             gr.Slider(0.1, 1, step=0.1, label="min_dist (UMAP)", value=0.1, elem_id="min_dist"),
-         ]
      )

-
+ # %%
+ import gradio as gr
+
  import torch
  from PIL import Image
  import numpy as np
  import time

  import gradio as gr

+ from backbone import extract_features
+ from ncut_pytorch import NCUT, rgb_from_tsne_3d, rgb_from_umap_3d

+ import spaces
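
`import spaces` is the Hugging Face ZeroGPU helper (the Space header says "Running on Zero"). The hunk only shows the import, so whether and where the decorator is applied is not visible here; a typical usage sketch, with `run_on_gpu` as a hypothetical function name:

```python
import spaces
import torch


@spaces.GPU  # request a ZeroGPU device only for the duration of this call
def run_on_gpu(images):
    # CUDA is available inside the decorated function on a ZeroGPU Space;
    # heavy model inference (e.g. feature extraction) would go here.
    return torch.cuda.is_available()
```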

  def compute_ncut(
      features,

      knn_ncut=10,
      knn_tsne=10,
      embedding_method="UMAP",
+     num_sample_tsne=300,
+     perplexity=150,
+     n_neighbors=150,
      min_dist=0.1,
  ):

      start = time.time()
      eigvecs, eigvals = NCUT(
          num_eig=num_eig,
          num_sample=num_sample_ncut,
+         device="cuda" if torch.cuda.is_available() else "cpu",
          affinity_focal_gamma=affinity_focal_gamma,
          knn=knn_ncut,
      ).fit_transform(features.reshape(-1, features.shape[-1]))
              eigvecs,
              n_neighbors=n_neighbors,
              min_dist=min_dist,
+             device="cuda" if torch.cuda.is_available() else "cpu",
          )
          print(f"UMAP time: {time.time() - start:.2f}s")
      elif embedding_method == "t-SNE":

              num_sample=num_sample_tsne,
              perplexity=perplexity,
              knn=knn_tsne,
+             device="cuda" if torch.cuda.is_available() else "cpu",
          )
          print(f"t-SNE time: {time.time() - start:.2f}s")
      else:
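
For context on what compute_ncut does with these parameters: `NCUT(...).fit_transform` turns the stacked per-pixel features into `num_eig` eigenvectors, and `rgb_from_tsne_3d` / `rgb_from_umap_3d` then compress those eigenvectors into three dimensions for display as RGB. A sketch of that flow using the same ncut_pytorch calls the app imports; the tensor shapes, parameter values, and the two-value unpacking of the rgb helper are assumptions of this sketch, not taken from the diff:

```python
import torch
from ncut_pytorch import NCUT, rgb_from_tsne_3d

# Features as produced by backbone.extract_features: (num_images, h, w, channels).
features = torch.randn(2, 32, 32, 768)
flat = features.reshape(-1, features.shape[-1])  # one row per pixel

eigvecs, eigvals = NCUT(
    num_eig=100,
    num_sample=10000,                 # Nyström approximation
    affinity_focal_gamma=0.3,
    knn=10,
    device="cuda" if torch.cuda.is_available() else "cpu",
).fit_transform(flat)

# Assumed return convention: the 3-D embedding plus an RGB tensor in [0, 1].
_, rgb = rgb_from_tsne_3d(eigvecs, num_sample=300, perplexity=150, knn=10)
rgb_images = rgb.reshape(2, 32, 32, 3)  # back to one RGB image per input
```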
      n_neighbors=500,
      min_dist=0.1,
  ):
+     if perplexity >= num_sample_tsne or n_neighbors >= num_sample_tsne:
          # raise gr.Error("Perplexity must be less than the number of samples for t-SNE.")
+         gr.Warning("Perplexity/n_neighbors must be less than the number of samples.\n" f"Setting to {num_sample_tsne-1}.")
          perplexity = num_sample_tsne - 1
+         n_neighbors = num_sample_tsne - 1

+
+     node_type = node_type.split(":")[0].strip()
+
+     images = [image[0] for image in images]  # remove the label

      start = time.time()
      features = extract_features(
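
The `image[0]` indexing above exists because a `gr.Gallery` used as an input hands `main_fn` a list of (image, caption) pairs rather than bare images; only the image part is needed downstream. A small illustration with made-up values:

```python
from PIL import Image

# What the gallery passes to the callback: (image, caption) pairs.
gallery_value = [
    (Image.new("RGB", (64, 64)), None),
    (Image.new("RGB", (64, 64)), "example"),
]

images = [item[0] for item in gallery_value]  # keep the PIL image, drop the caption
print(len(images), images[0].size)
```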
  default_images = ['./images/image_0.jpg', './images/image_1.jpg', './images/image_2.jpg', './images/image_3.jpg', './images/image_5.jpg']
  default_outputs = ['./images/ncut_0.jpg', './images/ncut_1.jpg', './images/ncut_2.jpg', './images/ncut_3.jpg', './images/ncut_5.jpg']

+ downscaled_images = ['./images/image_0_small.jpg', './images/image_1_small.jpg', './images/image_2_small.jpg', './images/image_3_small.jpg', './images/image_5_small.jpg']
+ downscaled_outputs = ['./images/ncut_0_small.jpg', './images/ncut_1_small.jpg', './images/ncut_2_small.jpg', './images/ncut_3_small.jpg', './images/ncut_5_small.jpg']
+
  with gr.Blocks() as demo:
+
+     with gr.Row():
+         with gr.Column(scale=5, min_width=200):
+             input_gallery = gr.Gallery(value=[], label="Select images", show_label=False, elem_id="images", columns=[3], rows=[1], object_fit="contain", height="auto", type="pil", show_share_button=False)
+             submit_button = gr.Button("🔴Submit", elem_id="submit_button")
+             clear_images_button = gr.Button("🗑️Clear Images")
+
+             gr.Markdown('### Load Examples 👇')
+             load_images_button = gr.Button("Load", elem_id="load-images-button")
+             gr.Gallery(value=downscaled_images[:3] + downscaled_outputs[:3], label="Example Set A", show_label=False, columns=[3], rows=[2], object_fit="scale-down", height="200px", show_share_button=False)
+
+         with gr.Column(scale=5, min_width=200):
+             output_gallery = gr.Gallery(value=[], label="NCUT Embedding", show_label=False, elem_id="ncut", columns=[3], rows=[1], object_fit="contain", height="auto")
+             model_dropdown = gr.Dropdown(["SAM(sam_vit_b)", "MobileSAM", "DiNO(dinov2_vitb14_reg)", "CLIP(openai/clip-vit-base-patch16)"], label="Model", value="SAM(sam_vit_b)", elem_id="model_name")
+             layer_slider = gr.Slider(0, 11, step=1, label="Layer", value=11, elem_id="layer")
+             num_eig_slider = gr.Slider(1, 1000, step=1, label="Number of eigenvectors", value=100, elem_id="num_eig", info='increase for more clusters')
+             affinity_focal_gamma_slider = gr.Slider(0.01, 1, step=0.01, label="Affinity focal gamma", value=0.3, elem_id="affinity_focal_gamma", info="decrease for sharper NCUT")
+
+             with gr.Accordion("Additional Parameters", open=False):
+                 node_type_dropdown = gr.Dropdown(["attn: attention output", "mlp: mlp output", "block: sum of residual"], label="Node type", value="block: sum of residual", elem_id="node_type", info="which feature to take from each layer?")
+                 num_sample_ncut_slider = gr.Slider(100, 50000, step=100, label="num_sample (NCUT)", value=10000, elem_id="num_sample_ncut", info="Nyström approximation")
+                 knn_ncut_slider = gr.Slider(1, 100, step=1, label="KNN (NCUT)", value=10, elem_id="knn_ncut", info="Nyström approximation")
+                 embedding_method_dropdown = gr.Dropdown(["t-SNE", "UMAP"], label="Embedding method", value="t-SNE", elem_id="embedding_method")
+                 num_sample_tsne_slider = gr.Slider(100, 1000, step=100, label="num_sample (t-SNE/UMAP)", value=300, elem_id="num_sample_tsne", info="Nyström approximation")
+                 knn_tsne_slider = gr.Slider(1, 100, step=1, label="KNN (t-SNE/UMAP)", value=10, elem_id="knn_tsne", info="Nyström approximation")
+                 perplexity_slider = gr.Slider(10, 500, step=10, label="Perplexity (t-SNE)", value=150, elem_id="perplexity")
+                 n_neighbors_slider = gr.Slider(10, 500, step=10, label="n_neighbors (UMAP)", value=150, elem_id="n_neighbors")
+                 min_dist_slider = gr.Slider(0.1, 1, step=0.1, label="min_dist (UMAP)", value=0.1, elem_id="min_dist")
+
+     def load_default_images():
+         return default_images, default_outputs
+
+     def empty_input_and_output():
+         return [], []
+
+     load_images_button.click(load_default_images, outputs=[input_gallery, output_gallery])
+     clear_images_button.click(empty_input_and_output, outputs=[input_gallery, output_gallery])
+     submit_button.click(
          main_fn,
+         inputs=[
+             input_gallery, model_dropdown, layer_slider, num_eig_slider, node_type_dropdown,
+             affinity_focal_gamma_slider, num_sample_ncut_slider, knn_ncut_slider,
+             embedding_method_dropdown, num_sample_tsne_slider, knn_tsne_slider,
+             perplexity_slider, n_neighbors_slider, min_dist_slider
+         ],
+         outputs=output_gallery
      )

+
+ demo.launch(share=True)
backbone.py
ADDED
@@ -0,0 +1,394 @@
from typing import Optional, Tuple
from einops import rearrange
import torch
import torch.nn.functional as F
from PIL import Image
from torch import nn
import numpy as np
import os
import time

import gradio as gr

MODEL_DICT = {}


def transform_images(images, resolution=(1024, 1024)):
    images = [image.convert("RGB").resize(resolution) for image in images]
    # Convert to torch tensor
    images = [
        torch.tensor(np.array(image).transpose(2, 0, 1)).float() / 255
        for image in images
    ]
    # Normalize
    images = [(image - 0.5) / 0.5 for image in images]
    images = torch.stack(images)
    return images


class MobileSAM(nn.Module):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        from mobile_sam import sam_model_registry

        url = "https://raw.githubusercontent.com/ChaoningZhang/MobileSAM/master/weights/mobile_sam.pt"
        model_type = "vit_t"
        sam_checkpoint = "mobile_sam.pt"
        if not os.path.exists(sam_checkpoint):
            import requests

            r = requests.get(url)
            with open(sam_checkpoint, "wb") as f:
                f.write(r.content)

        mobile_sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)

        def new_forward_fn(self, x):
            shortcut = x

            x = self.conv1(x)
            x = self.act1(x)

            x = self.conv2(x)
            x = self.act2(x)

            self.attn_output = rearrange(x.clone(), "b c h w -> b h w c")

            x = self.conv3(x)

            self.mlp_output = rearrange(x.clone(), "b c h w -> b h w c")

            x = self.drop_path(x)

            x += shortcut
            x = self.act3(x)

            self.block_output = rearrange(x.clone(), "b c h w -> b h w c")

            return x

        setattr(
            mobile_sam.image_encoder.layers[0].blocks[0].__class__,
            "forward",
            new_forward_fn,
        )

        def new_forward_fn2(self, x):
            H, W = self.input_resolution
            B, L, C = x.shape
            assert L == H * W, "input feature has wrong size"
            res_x = x
            if H == self.window_size and W == self.window_size:
                x = self.attn(x)
            else:
                x = x.view(B, H, W, C)
                pad_b = (self.window_size - H % self.window_size) % self.window_size
                pad_r = (self.window_size - W % self.window_size) % self.window_size
                padding = pad_b > 0 or pad_r > 0

                if padding:
                    x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b))

                pH, pW = H + pad_b, W + pad_r
                nH = pH // self.window_size
                nW = pW // self.window_size
                # window partition
                x = (
                    x.view(B, nH, self.window_size, nW, self.window_size, C)
                    .transpose(2, 3)
                    .reshape(B * nH * nW, self.window_size * self.window_size, C)
                )
                x = self.attn(x)
                # window reverse
                x = (
                    x.view(B, nH, nW, self.window_size, self.window_size, C)
                    .transpose(2, 3)
                    .reshape(B, pH, pW, C)
                )

                if padding:
                    x = x[:, :H, :W].contiguous()

                x = x.view(B, L, C)

            hw = np.sqrt(x.shape[1]).astype(int)
            self.attn_output = rearrange(x.clone(), "b (h w) c -> b h w c", h=hw)

            x = res_x + self.drop_path(x)

            x = x.transpose(1, 2).reshape(B, C, H, W)
            x = self.local_conv(x)
            x = x.view(B, C, L).transpose(1, 2)

            mlp_output = self.mlp(x)
            self.mlp_output = rearrange(
                mlp_output.clone(), "b (h w) c -> b h w c", h=hw
            )

            x = x + self.drop_path(mlp_output)
            self.block_output = rearrange(x.clone(), "b (h w) c -> b h w c", h=hw)
            return x

        setattr(
            mobile_sam.image_encoder.layers[1].blocks[0].__class__,
            "forward",
            new_forward_fn2,
        )

        mobile_sam.eval()
        self.image_encoder = mobile_sam.image_encoder

    @torch.no_grad()
    def forward(self, x):
        with torch.no_grad():
            x = torch.nn.functional.interpolate(x, size=(1024, 1024), mode="bilinear")
            out = self.image_encoder(x)

            attn_outputs, mlp_outputs, block_outputs = [], [], []
            for i_layer in range(len(self.image_encoder.layers)):
                for i_block in range(len(self.image_encoder.layers[i_layer].blocks)):
                    blk = self.image_encoder.layers[i_layer].blocks[i_block]
                    attn_outputs.append(blk.attn_output)
                    mlp_outputs.append(blk.mlp_output)
                    block_outputs.append(blk.block_output)
            return attn_outputs, mlp_outputs, block_outputs


MODEL_DICT["MobileSAM"] = MobileSAM()


class SAM(torch.nn.Module):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        from segment_anything import sam_model_registry, SamPredictor
        from segment_anything.modeling.sam import Sam

        checkpoint = "sam_vit_b_01ec64.pth"
        if not os.path.exists(checkpoint):
            checkpoint_url = (
                "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
            )
            import requests

            r = requests.get(checkpoint_url)
            with open(checkpoint, "wb") as f:
                f.write(r.content)

        sam: Sam = sam_model_registry["vit_b"](checkpoint=checkpoint)

        from segment_anything.modeling.image_encoder import (
            window_partition,
            window_unpartition,
        )

        def new_block_forward(self, x: torch.Tensor) -> torch.Tensor:
            shortcut = x
            x = self.norm1(x)
            # Window partition
            if self.window_size > 0:
                H, W = x.shape[1], x.shape[2]
                x, pad_hw = window_partition(x, self.window_size)

            x = self.attn(x)
            # Reverse window partition
            if self.window_size > 0:
                x = window_unpartition(x, self.window_size, pad_hw, (H, W))
            self.attn_output = x.clone()

            x = shortcut + x
            mlp_outout = self.mlp(self.norm2(x))
            self.mlp_output = mlp_outout.clone()
            x = x + mlp_outout
            self.block_output = x.clone()

            return x

        setattr(sam.image_encoder.blocks[0].__class__, "forward", new_block_forward)

        self.image_encoder = sam.image_encoder
        self.image_encoder.eval()

    @torch.no_grad()
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            x = torch.nn.functional.interpolate(x, size=(1024, 1024), mode="bilinear")
            out = self.image_encoder(x)

            attn_outputs, mlp_outputs, block_outputs = [], [], []
            for i, blk in enumerate(self.image_encoder.blocks):
                attn_outputs.append(blk.attn_output)
                mlp_outputs.append(blk.mlp_output)
                block_outputs.append(blk.block_output)
            attn_outputs = torch.stack(attn_outputs)
            mlp_outputs = torch.stack(mlp_outputs)
            block_outputs = torch.stack(block_outputs)
            return attn_outputs, mlp_outputs, block_outputs


MODEL_DICT["SAM(sam_vit_b)"] = SAM()


class DiNOv2(torch.nn.Module):
    def __init__(self, ver="dinov2_vitb14_reg"):
        super().__init__()
        self.dinov2 = torch.hub.load("facebookresearch/dinov2", ver)
        self.dinov2.requires_grad_(False)
        self.dinov2.eval()

        def new_block_forward(self, x: torch.Tensor) -> torch.Tensor:
            def attn_residual_func(x):
                return self.ls1(self.attn(self.norm1(x)))

            def ffn_residual_func(x):
                return self.ls2(self.mlp(self.norm2(x)))

            attn_output = attn_residual_func(x)

            hw = np.sqrt(attn_output.shape[1] - 5).astype(int)
            self.attn_output = rearrange(
                attn_output.clone()[:, 5:], "b (h w) c -> b h w c", h=hw
            )

            x = x + attn_output
            mlp_output = ffn_residual_func(x)
            self.mlp_output = rearrange(
                mlp_output.clone()[:, 5:], "b (h w) c -> b h w c", h=hw
            )
            x = x + mlp_output
            block_output = x
            self.block_output = rearrange(
                block_output.clone()[:, 5:], "b (h w) c -> b h w c", h=hw
            )
            return x

        setattr(self.dinov2.blocks[0].__class__, "forward", new_block_forward)

    @torch.no_grad()
    def forward(self, x):

        out = self.dinov2(x)

        attn_outputs, mlp_outputs, block_outputs = [], [], []
        for i, blk in enumerate(self.dinov2.blocks):
            attn_outputs.append(blk.attn_output)
            mlp_outputs.append(blk.mlp_output)
            block_outputs.append(blk.block_output)

        attn_outputs = torch.stack(attn_outputs)
        mlp_outputs = torch.stack(mlp_outputs)
        block_outputs = torch.stack(block_outputs)
        return attn_outputs, mlp_outputs, block_outputs


MODEL_DICT["DiNO(dinov2_vitb14_reg)"] = DiNOv2()


class CLIP(torch.nn.Module):
    def __init__(self):
        super().__init__()

        from transformers import CLIPProcessor, CLIPModel

        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
        # processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
        self.model = model.eval()

        def new_forward(
            self,
            hidden_states: torch.Tensor,
            attention_mask: torch.Tensor,
            causal_attention_mask: torch.Tensor,
            output_attentions: Optional[bool] = False,
        ) -> Tuple[torch.FloatTensor]:

            residual = hidden_states

            hidden_states = self.layer_norm1(hidden_states)
            hidden_states, attn_weights = self.self_attn(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                causal_attention_mask=causal_attention_mask,
                output_attentions=output_attentions,
            )
            hw = np.sqrt(hidden_states.shape[1] - 1).astype(int)
            self.attn_output = rearrange(
                hidden_states.clone()[:, 1:], "b (h w) c -> b h w c", h=hw
            )
            hidden_states = residual + hidden_states

            residual = hidden_states
            hidden_states = self.layer_norm2(hidden_states)
            hidden_states = self.mlp(hidden_states)
            self.mlp_output = rearrange(
                hidden_states.clone()[:, 1:], "b (h w) c -> b h w c", h=hw
            )

            hidden_states = residual + hidden_states

            outputs = (hidden_states,)

            if output_attentions:
                outputs += (attn_weights,)

            self.block_output = rearrange(
                hidden_states.clone()[:, 1:], "b (h w) c -> b h w c", h=hw
            )
            return outputs

        setattr(
            self.model.vision_model.encoder.layers[0].__class__, "forward", new_forward
        )

    @torch.no_grad()
    def forward(self, x):

        out = self.model.vision_model(x)

        attn_outputs, mlp_outputs, block_outputs = [], [], []
        for i, blk in enumerate(self.model.vision_model.encoder.layers):
            attn_outputs.append(blk.attn_output)
            mlp_outputs.append(blk.mlp_output)
            block_outputs.append(blk.block_output)

        attn_outputs = torch.stack(attn_outputs)
        mlp_outputs = torch.stack(mlp_outputs)
        block_outputs = torch.stack(block_outputs)
        return attn_outputs, mlp_outputs, block_outputs


MODEL_DICT["CLIP(openai/clip-vit-base-patch16)"] = CLIP()


def extract_features(images, model_name, node_type, layer):
    resolution_dict = {
        "MobileSAM": (1024, 1024),
        "SAM(sam_vit_b)": (1024, 1024),
        "DiNO(dinov2_vitb14_reg)": (448, 448),
        "CLIP(openai/clip-vit-base-patch16)": (224, 224),
    }
    images = transform_images(images, resolution=resolution_dict[model_name])

    model = MODEL_DICT[model_name]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda()

    outputs = []
    for i in range(images.shape[0]):
        inp = images[i].unsqueeze(0)
        if use_cuda:
            inp = inp.cuda()
        attn_output, mlp_output, block_output = model(inp)
        out_dict = {
            "attn": attn_output,
            "mlp": mlp_output,
            "block": block_output,
        }
        out = out_dict[node_type]
        out = out[layer]
        outputs.append(out)
    outputs = torch.cat(outputs, dim=0)

    return outputs
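
backbone.py keeps all wrappers behind one contract: each registers itself in MODEL_DICT, and its forward returns (attn_outputs, mlp_outputs, block_outputs) with one (1, h, w, channels) tensor per layer, which is what extract_features indexes with node_type and layer. A hedged sketch of plugging in an extra backbone under that contract; MyEncoder and its internals are hypothetical, and extract_features' hard-coded resolution_dict would also need an entry for the new key:

```python
import torch

from backbone import MODEL_DICT, extract_features


class MyEncoder(torch.nn.Module):
    """Hypothetical extra backbone following the existing wrappers' contract."""

    @torch.no_grad()
    def forward(self, x):
        # Placeholder features: one (1, h, w, c) tensor per "layer" and node type.
        feats = torch.randn(x.shape[0], 32, 32, 256)
        attn_outputs = [feats]
        mlp_outputs = [feats]
        block_outputs = [feats]
        return attn_outputs, mlp_outputs, block_outputs


MODEL_DICT["MyEncoder(toy)"] = MyEncoder()
# Existing models are used exactly the same way through extract_features, e.g.:
# features = extract_features(pil_images, "MobileSAM", node_type="block", layer=-1)
```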
images/example_a.jpg
ADDED
images/image_0.jpg
CHANGED
images/image_0_small.jpg
ADDED
images/image_1.jpg
CHANGED
images/image_1_small.jpg
ADDED
images/image_2.jpg
CHANGED
images/image_2_small.jpg
ADDED
images/image_3.jpg
CHANGED
images/image_3_small.jpg
ADDED
images/image_5.jpg
CHANGED
images/image_5_small.jpg
ADDED
images/ncut_0_small.jpg
ADDED
images/ncut_1_small.jpg
ADDED
images/ncut_2_small.jpg
ADDED
images/ncut_3_small.jpg
ADDED
images/ncut_5_small.jpg
ADDED