Spaces:

huzey
/

ncut-pytorch

Running on Zero

App Files Files Community

huzey committed on Aug 27

Commit

c471250

•

1 Parent(s): 86da6bf

update models

Browse files

Files changed (2) hide show

app.py +1 -1
backbone.py +102 -21

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # Author: Huzheng Yang
 # %%
-USE_SPACES = False
 if USE_SPACES:
     import spaces

 # Author: Huzheng Yang
 # %%
+USE_SPACES = True
 if USE_SPACES:
     import spaces

backbone.py CHANGED Viewed

@@ -2,6 +2,7 @@ from typing import Optional, Tuple
 from einops import rearrange
 import torch
 import torch.nn.functional as F
 from PIL import Image
 from torch import nn
 import numpy as np
@@ -13,18 +14,16 @@ import gradio as gr
 MODEL_DICT = {}
-def transform_images(images, resolution=(1024, 1024)):
-    images = [image.convert("RGB").resize(resolution) for image in images]
     # Convert to torch tensor
-    images = [
-        torch.tensor(np.array(image).transpose(2, 0, 1)).float() / 255
-        for image in images
-    ]
     # Normalize
-    images = [(image - 0.5) / 0.5 for image in images]
-    images = torch.stack(images)
-    return images
 class MobileSAM(nn.Module):
     def __init__(self, **kwargs):
@@ -283,7 +282,6 @@ class DiNOv2(torch.nn.Module):
 MODEL_DICT["DiNO(dinov2_vitb14_reg)"] = DiNOv2()
 class CLIP(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -291,6 +289,18 @@ class CLIP(torch.nn.Module):
         from transformers import CLIPProcessor, CLIPModel
         model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
         # processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
         self.model = model.eval()
@@ -360,26 +370,90 @@ class CLIP(torch.nn.Module):
 MODEL_DICT["CLIP(openai/clip-vit-base-patch16)"] = CLIP()
 def extract_features(images, model_name, node_type, layer):
     resolution_dict = {
-        "MobileSAM": (1024, 1024),
-        "SAM(sam_vit_b)": (1024, 1024),
-        "DiNO(dinov2_vitb14_reg)": (448, 448),
-        "CLIP(openai/clip-vit-base-patch16)": (224, 224),
     }
-    images = transform_images(images, resolution=resolution_dict[model_name])
     model = MODEL_DICT[model_name]
-    use_cuda = torch.cuda.is_available()
     if use_cuda:
         model = model.cuda()
     outputs = []
-    for i in range(images.shape[0]):
-        inp = images[i].unsqueeze(0)
-        if use_cuda:
-            inp = inp.cuda()
         attn_output, mlp_output, block_output = model(inp)
         out_dict = {
             "attn": attn_output,
@@ -392,3 +466,10 @@ def extract_features(images, model_name, node_type, layer):
     outputs = torch.cat(outputs, dim=0)
     return outputs

 from einops import rearrange
 import torch
 import torch.nn.functional as F
+import timm
 from PIL import Image
 from torch import nn
 import numpy as np
 MODEL_DICT = {}
+def transform_image(image, resolution=(1024, 1024), use_cuda=False):
+    image = image.convert('RGB').resize(resolution, Image.Resampling.NEAREST)
     # Convert to torch tensor
+    image = torch.tensor(np.array(image).transpose(2, 0, 1)).float()
+    if use_cuda:
+        image = image.cuda()
+    image = image / 255
     # Normalize
+    image = (image - 0.5) / 0.5
+    return image
 class MobileSAM(nn.Module):
     def __init__(self, **kwargs):
 MODEL_DICT["DiNO(dinov2_vitb14_reg)"] = DiNOv2()
 class CLIP(torch.nn.Module):
     def __init__(self):
         super().__init__()
         from transformers import CLIPProcessor, CLIPModel
         model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
+        # resample the patch embeddings to 64x64, take 1024x1024 input
+        embeddings = model.vision_model.embeddings.position_embedding.weight
+        cls_embeddings = embeddings[0]
+        patch_embeddings = embeddings[1:]  # [14*14, 768]
+        patch_embeddings = rearrange(patch_embeddings, "(h w) c -> c h w", h=14)
+        patch_embeddings = F.interpolate(patch_embeddings.unsqueeze(0), size=(64, 64), mode="bilinear", align_corners=False).squeeze(0)
+        patch_embeddings = rearrange(patch_embeddings, "c h w -> (h w) c")
+        embeddings = torch.cat([cls_embeddings.unsqueeze(0), patch_embeddings], dim=0)
+        model.vision_model.embeddings.position_embedding.weight = nn.Parameter(embeddings)
+        model.vision_model.embeddings.position_ids = torch.arange(0, 1+64*64)
         # processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
         self.model = model.eval()
 MODEL_DICT["CLIP(openai/clip-vit-base-patch16)"] = CLIP()
+class MAE(timm.models.vision_transformer.VisionTransformer):
+    def __init__(self, **kwargs):
+        super(MAE, self).__init__(**kwargs)
+        sd = torch.hub.load_state_dict_from_url(
+            "https://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_base.pth"
+        )
+        checkpoint_model = sd["model"]
+        state_dict = self.state_dict()
+        for k in ["head.weight", "head.bias"]:
+            if (
+                k in checkpoint_model
+                and checkpoint_model[k].shape != state_dict[k].shape
+            ):
+                print(f"Removing key {k} from pretrained checkpoint")
+                del checkpoint_model[k]
+        # load pre-trained model
+        msg = self.load_state_dict(checkpoint_model, strict=False)
+        print(msg)
+        # resample the patch embeddings to 64x64, take 1024x1024 input
+        pos_embed = self.pos_embed[0]
+        cls_embeddings = pos_embed[0]
+        patch_embeddings = pos_embed[1:]  # [14*14, 768]
+        patch_embeddings = rearrange(patch_embeddings, "(h w) c -> c h w", h=14)
+        patch_embeddings = F.interpolate(patch_embeddings.unsqueeze(0), size=(64, 64), mode="bilinear", align_corners=False).squeeze(0)
+        patch_embeddings = rearrange(patch_embeddings, "c h w -> (h w) c")
+        pos_embed = torch.cat([cls_embeddings.unsqueeze(0), patch_embeddings], dim=0)
+        self.pos_embed = nn.Parameter(pos_embed.unsqueeze(0))
+        self.img_size = (1024, 1024)
+        self.patch_embed.img_size = (1024, 1024)
+        self.requires_grad_(False)
+        self.eval()
+        def forward(self, x):
+            self.saved_attn_node = self.ls1(self.attn(self.norm1(x)))
+            x = x + self.saved_attn_node.clone()
+            self.saved_mlp_node = self.ls2(self.mlp(self.norm2(x)))
+            x = x + self.saved_mlp_node.clone()
+            self.saved_block_output = x.clone()
+            return x
+        setattr(self.blocks[0].__class__, "forward", forward)
+    def forward(self, x):
+        out = super().forward(x)
+        def remove_cls_and_reshape(x):
+            x = x.clone()
+            x = x[:, 1:]
+            hw = np.sqrt(x.shape[1]).astype(int)
+            x = rearrange(x, "b (h w) c -> b h w c", h=hw)
+            return x
+        attn_nodes = [remove_cls_and_reshape(block.saved_attn_node) for block in self.blocks]
+        mlp_nodes = [remove_cls_and_reshape(block.saved_mlp_node) for block in self.blocks]
+        block_outputs = [remove_cls_and_reshape(block.saved_block_output) for block in self.blocks]
+        return attn_nodes, mlp_nodes, block_outputs
+MODEL_DICT["MAE(vit_base)"] = MAE()
 def extract_features(images, model_name, node_type, layer):
+    use_cuda = torch.cuda.is_available()
+    resolution = (1024, 1024)
     resolution_dict = {
+        "DiNO(dinov2_vitb14_reg)": (896, 896),
     }
+    if model_name in resolution_dict:
+        resolution = resolution_dict[model_name]
     model = MODEL_DICT[model_name]
     if use_cuda:
         model = model.cuda()
     outputs = []
+    for i in range(len(images)):
+        image = transform_image(images[i], resolution=resolution, use_cuda=use_cuda)
+        inp = image.unsqueeze(0)
         attn_output, mlp_output, block_output = model(inp)
         out_dict = {
             "attn": attn_output,
     outputs = torch.cat(outputs, dim=0)
     return outputs
+if __name__ == '__main__':
+    inp = torch.rand(1, 3, 1024, 1024)
+    model = MAE()
+    out = model(inp)
+    print(out[0][0].shape, out[0][1].shape, out[0][2].shape)