import os
from typing import Optional, Tuple

from einops import rearrange
import torch
from PIL import Image
import torchvision.transforms as transforms
from torch import nn
import numpy as np
import gradio as gr


class SAM(torch.nn.Module):
    def __init__(self, checkpoint="/data/sam_model/sam_vit_b_01ec64.pth", **kwargs):
        super().__init__(**kwargs)
        from segment_anything import sam_model_registry, SamPredictor
        from segment_anything.modeling.sam import Sam

        sam: Sam = sam_model_registry["vit_b"](checkpoint=checkpoint)

        from segment_anything.modeling.image_encoder import (
            window_partition,
            window_unpartition,
        )

        # Replacement forward that caches the attention, MLP, and residual-stream
        # (block) outputs of each ViT block on the block instance itself.
        def new_block_forward(self, x: torch.Tensor) -> torch.Tensor:
            shortcut = x
            x = self.norm1(x)
            # Window partition
            if self.window_size > 0:
                H, W = x.shape[1], x.shape[2]
                x, pad_hw = window_partition(x, self.window_size)

            x = self.attn(x)
            # Reverse window partition
            if self.window_size > 0:
                x = window_unpartition(x, self.window_size, pad_hw, (H, W))
            self.attn_output = x.clone()

            x = shortcut + x
            mlp_output = self.mlp(self.norm2(x))
            self.mlp_output = mlp_output.clone()
            x = x + mlp_output
            self.block_output = x.clone()

            return x

        # Patch the block class (shared by all blocks) so every block caches its outputs.
        setattr(sam.image_encoder.blocks[0].__class__, "forward", new_block_forward)

        self.image_encoder = sam.image_encoder
        self.image_encoder.eval()
        # self.image_encoder = self.image_encoder.cuda()

    @torch.no_grad()
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            # SAM's image encoder expects 1024x1024 inputs.
            x = torch.nn.functional.interpolate(x, size=(1024, 1024), mode="bilinear")
            out = self.image_encoder(x)
            attn_outputs, mlp_outputs, block_outputs = [], [], []
            for i, blk in enumerate(self.image_encoder.blocks):
                attn_outputs.append(blk.attn_output)
                mlp_outputs.append(blk.mlp_output)
                block_outputs.append(blk.block_output)
        attn_outputs = torch.stack(attn_outputs)
        mlp_outputs = torch.stack(mlp_outputs)
        block_outputs = torch.stack(block_outputs)
        return attn_outputs, mlp_outputs, block_outputs


def image_sam_feature(
    images,
    resolution=(1024, 1024),
    node_type="block",
    layer=-1,
):
    transform = transforms.Compose(
        [
            transforms.Resize(resolution),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )

    # Download the SAM ViT-B checkpoint if it is not present locally.
    checkpoint = "sam_vit_b_01ec64.pth"
    if not os.path.exists(checkpoint):
        checkpoint_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
        import requests

        r = requests.get(checkpoint_url)
        with open(checkpoint, "wb") as f:
            f.write(r.content)

    feat_extractor = SAM(checkpoint=checkpoint)

    outputs = []
    for i, image in enumerate(images):
        torch_image = transform(image)
        attn_output, mlp_output, block_output = feat_extractor(
            # torch_image.unsqueeze(0).cuda()
            torch_image.unsqueeze(0)
        )
        out_dict = {
            "attn": attn_output,
            "mlp": mlp_output,
            "block": block_output,
        }
        out = out_dict[node_type]
        out = out[layer]
        outputs.append(out.cpu())
    outputs = torch.cat(outputs, dim=0)

    return outputs


class DiNOv2(torch.nn.Module):
    def __init__(self, ver="dinov2_vitb14_reg"):
        super().__init__()
        self.dinov2 = torch.hub.load("facebookresearch/dinov2", ver)
        self.dinov2.requires_grad_(False)
        self.dinov2.eval()
        # self.dinov2 = self.dinov2.cuda()

        # Same trick as for SAM: cache attention, MLP, and block outputs per block.
        def new_block_forward(self, x: torch.Tensor) -> torch.Tensor:
            def attn_residual_func(x):
                return self.ls1(self.attn(self.norm1(x)))

            def ffn_residual_func(x):
                return self.ls2(self.mlp(self.norm2(x)))

            attn_output = attn_residual_func(x)
            self.attn_output = attn_output.clone()
            x = x + attn_output
            mlp_output = ffn_residual_func(x)
            self.mlp_output = mlp_output.clone()
            x = x + mlp_output
            block_output = x
            self.block_output = block_output.clone()
            return x

        setattr(self.dinov2.blocks[0].__class__, "forward", new_block_forward)

    @torch.no_grad()
    def forward(self, x):
        out = self.dinov2(x)
        attn_outputs, mlp_outputs, block_outputs = [], [], []
        for i, blk in enumerate(self.dinov2.blocks):
            attn_outputs.append(blk.attn_output)
            mlp_outputs.append(blk.mlp_output)
            block_outputs.append(blk.block_output)
        attn_outputs = torch.stack(attn_outputs)
        mlp_outputs = torch.stack(mlp_outputs)
        block_outputs = torch.stack(block_outputs)
        return attn_outputs, mlp_outputs, block_outputs


def image_dino_feature(images, resolution=(448, 448), node_type="block", layer=-1):
    transform = transforms.Compose(
        [
            transforms.Resize(resolution),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )

    feat_extractor = DiNOv2()

    outputs = []
    for i, image in enumerate(images):
        torch_image = transform(image)
        attn_output, mlp_output, block_output = feat_extractor(
            # torch_image.unsqueeze(0).cuda()
            torch_image.unsqueeze(0)
        )
        out_dict = {
            "attn": attn_output,
            "mlp": mlp_output,
            "block": block_output,
        }
        out = out_dict[node_type]
        out = out[layer]
        outputs.append(out.cpu())
    outputs = torch.cat(outputs, dim=0)
    # Drop the CLS token and 4 register tokens, then fold the 32x32 patch grid
    # (448 / patch size 14 = 32) back into spatial dimensions.
    outputs = rearrange(outputs[:, 5:, :], "b (h w) c -> b h w c", h=32, w=32)
    return outputs


class CLIP(torch.nn.Module):
    def __init__(self):
        super().__init__()
        from transformers import CLIPProcessor, CLIPModel

        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
        # processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
        self.model = model.eval()
        # self.model = self.model.cuda()

        # Replacement encoder-layer forward that caches attention, MLP, and block
        # outputs on the layer instance.
        def new_forward(
            self,
            hidden_states: torch.Tensor,
            attention_mask: torch.Tensor,
            causal_attention_mask: torch.Tensor,
            output_attentions: Optional[bool] = False,
        ) -> Tuple[torch.FloatTensor]:
            residual = hidden_states

            hidden_states = self.layer_norm1(hidden_states)
            hidden_states, attn_weights = self.self_attn(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                causal_attention_mask=causal_attention_mask,
                output_attentions=output_attentions,
            )
            self.attn_output = hidden_states.clone()
            hidden_states = residual + hidden_states

            residual = hidden_states
            hidden_states = self.layer_norm2(hidden_states)
            hidden_states = self.mlp(hidden_states)
            self.mlp_output = hidden_states.clone()
            hidden_states = residual + hidden_states

            outputs = (hidden_states,)
            if output_attentions:
                outputs += (attn_weights,)

            self.block_output = hidden_states.clone()
            return outputs

        setattr(self.model.vision_model.encoder.layers[0].__class__, "forward", new_forward)

    @torch.no_grad()
    def forward(self, x):
        out = self.model.vision_model(x)
        attn_outputs, mlp_outputs, block_outputs = [], [], []
        for i, blk in enumerate(self.model.vision_model.encoder.layers):
            attn_outputs.append(blk.attn_output)
            mlp_outputs.append(blk.mlp_output)
            block_outputs.append(blk.block_output)
        attn_outputs = torch.stack(attn_outputs)
        mlp_outputs = torch.stack(mlp_outputs)
        block_outputs = torch.stack(block_outputs)
        return attn_outputs, mlp_outputs, block_outputs


def image_clip_feature(images, resolution=(224, 224), node_type="block", layer=-1):
    if isinstance(images, list):
        assert isinstance(images[0], Image.Image), "Input must be a list of PIL images."
    else:
        assert isinstance(images, Image.Image), "Input must be a PIL image."
        images = [images]

    transform = transforms.Compose(
        [
            transforms.Resize(resolution),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )

    feat_extractor = CLIP()

    outputs = []
    for i, image in enumerate(images):
        torch_image = transform(image)
        attn_output, mlp_output, block_output = feat_extractor(
            # torch_image.unsqueeze(0).cuda()
            torch_image.unsqueeze(0)
        )
        out_dict = {
            "attn": attn_output,
            "mlp": mlp_output,
            "block": block_output,
        }
        out = out_dict[node_type]
        out = out[layer]
        outputs.append(out.cpu())
    outputs = torch.cat(outputs, dim=0)
    # Drop the CLS token and fold the 14x14 patch grid (224 / patch size 16 = 14)
    # back into spatial dimensions, mirroring the DiNO path, so that compute_ncut
    # can reshape the RGB output to (batch, height, width, 3).
    outputs = rearrange(outputs[:, 1:, :], "b (h w) c -> b h w c", h=14, w=14)
    return outputs


def extract_features(images, model_name="sam", node_type="block", layer=-1):
    if model_name == "SAM(sam_vit_b)":
        return image_sam_feature(images, node_type=node_type, layer=layer)
    elif model_name == "DiNO(dinov2_vitb14_reg)":
        return image_dino_feature(images, node_type=node_type, layer=layer)
    elif model_name == "CLIP(openai/clip-vit-base-patch16)":
        return image_clip_feature(images, node_type=node_type, layer=layer)
    else:
        raise ValueError(f"Model {model_name} not supported.")


def compute_ncut(
    features,
    num_eig=100,
    num_sample_ncut=10000,
    affinity_focal_gamma=0.3,
    knn_ncut=10,
    knn_tsne=10,
    num_sample_tsne=1000,
    perplexity=500,
):
    from ncut_pytorch import NCUT, rgb_from_tsne_3d

    # NCUT eigenvectors on the flattened (patch, channel) feature matrix, then a
    # t-SNE 3D embedding of the eigenvectors mapped to RGB for visualization.
    eigvecs, eigvals = NCUT(
        num_eig=num_eig,
        num_sample=num_sample_ncut,
        # device="cuda:0",
        affinity_focal_gamma=affinity_focal_gamma,
        knn=knn_ncut,
    ).fit_transform(features.reshape(-1, features.shape[-1]))
    X_3d, rgb = rgb_from_tsne_3d(
        eigvecs,
        num_sample=num_sample_tsne,
        perplexity=perplexity,
        knn=knn_tsne,
    )
    rgb = rgb.reshape(features.shape[:3] + (3,))
    return rgb


def dont_use_too_much_green(image_rgb):
    # Make sure the central 40% of the image leads with red: reorder the three
    # color channels by their mean brightness inside the center crop.
    x1, x2 = int(image_rgb.shape[1] * 0.3), int(image_rgb.shape[1] * 0.7)
    y1, y2 = int(image_rgb.shape[2] * 0.3), int(image_rgb.shape[2] * 0.7)
    sum_values = image_rgb[:, x1:x2, y1:y2].mean((0, 1, 2))
    sorted_indices = sum_values.argsort(descending=True)
    image_rgb = image_rgb[:, :, :, sorted_indices]
    return image_rgb


def to_pil_images(images):
    return [
        Image.fromarray((image * 255).cpu().numpy().astype(np.uint8)).resize(
            (256, 256), Image.NEAREST
        )
        for image in images
    ]


def main_fn(
    images,
    model_name="SAM(sam_vit_b)",
    node_type="block",
    layer=-1,
    num_eig=100,
    affinity_focal_gamma=0.3,
    num_sample_ncut=10000,
    knn_ncut=10,
    num_sample_tsne=1000,
    knn_tsne=10,
    perplexity=500,
):
    if perplexity >= num_sample_tsne:
        # raise gr.Error("Perplexity must be less than the number of samples for t-SNE.")
        gr.Warning(
            "Perplexity must be less than the number of samples for t-SNE.\n"
            f"Setting perplexity to {num_sample_tsne-1}."
        )
        perplexity = num_sample_tsne - 1

    # Gallery items are (image, caption) pairs; keep only the images.
    images = [image[0] for image in images]
    features = extract_features(
        images, model_name=model_name, node_type=node_type, layer=layer
    )
    rgb = compute_ncut(
        features,
        num_eig=num_eig,
        num_sample_ncut=num_sample_ncut,
        affinity_focal_gamma=affinity_focal_gamma,
        knn_ncut=knn_ncut,
        knn_tsne=knn_tsne,
        num_sample_tsne=num_sample_tsne,
        perplexity=perplexity,
    )
    rgb = dont_use_too_much_green(rgb)
    return to_pil_images(rgb)


default_images = [
    './images/image_0.jpg', './images/image_1.jpg', './images/image_2.jpg',
    './images/image_3.jpg', './images/image_4.jpg', './images/image_5.jpg',
]
default_outputs = [
    './images/ncut_0.jpg', './images/ncut_1.jpg', './images/ncut_2.jpg',
    './images/ncut_3.jpg', './images/ncut_4.jpg', './images/ncut_5.jpg',
]

demo = gr.Interface(
    main_fn,
    [
        gr.Gallery(value=default_images, label="Select images", show_label=False, elem_id="images",
                   columns=[3], rows=[1], object_fit="contain", height="auto", type="pil"),
        gr.Dropdown(["SAM(sam_vit_b)", "DiNO(dinov2_vitb14_reg)", "CLIP(openai/clip-vit-base-patch16)"],
                    label="Model", value="SAM(sam_vit_b)", elem_id="model_name"),
        gr.Dropdown(["attn", "mlp", "block"], label="Node type", value="block", elem_id="node_type",
                    info="attn: attention output, mlp: mlp output, block: sum of residual stream"),
        gr.Slider(0, 11, step=1, label="Layer", value=11, elem_id="layer",
                  info="which layer of the image backbone features"),
        gr.Slider(1, 1000, step=1, label="Number of eigenvectors", value=100, elem_id="num_eig",
                  info="increase for more object parts, decrease for whole objects"),
        gr.Slider(0.01, 1, step=0.01, label="Affinity focal gamma", value=0.3, elem_id="affinity_focal_gamma",
                  info="decrease for more aggressive cleaning of the affinity matrix"),
    ],
    gr.Gallery(value=default_outputs, label="NCUT Embedding", show_label=False, elem_id="ncut",
               columns=[3], rows=[1], object_fit="contain", height="auto"),
    additional_inputs=[
        gr.Slider(100, 30000, step=100, label="num_sample (NCUT)", value=10000, elem_id="num_sample_ncut",
                  info="for Nyström approximation"),
        gr.Slider(1, 100, step=1, label="KNN (NCUT)", value=10, elem_id="knn_ncut",
                  info="for Nyström approximation"),
        gr.Slider(100, 10000, step=100, label="num_sample (t-SNE)", value=1000, elem_id="num_sample_tsne",
                  info="for Nyström approximation; increasing this slows down t-SNE considerably"),
        gr.Slider(1, 100, step=1, label="KNN (t-SNE)", value=10, elem_id="knn_tsne",
                  info="for Nyström approximation"),
        gr.Slider(10, 1000, step=10, label="Perplexity (t-SNE)", value=500, elem_id="perplexity",
                  info="for t-SNE"),
    ],
)

demo.launch()
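
# A minimal sketch of running the pipeline without the Gradio UI (an assumption,
# not part of the original demo: it presumes the default_images files exist
# locally). main_fn expects gallery-style entries, i.e. (image, caption) pairs,
# hence the tuple wrapping:
#
#   gallery = [(Image.open(p), None) for p in default_images]
#   ncut_images = main_fn(gallery, model_name="SAM(sam_vit_b)", node_type="block", layer=11)
#   for i, img in enumerate(ncut_images):
#       img.save(f"ncut_{i}.jpg")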