# ncut-pytorch / app.py
import os
from typing import Optional, Tuple
from einops import rearrange
import torch
from PIL import Image
import torchvision.transforms as transforms
from torch import nn
import numpy as np
import gradio as gr
class SAM(torch.nn.Module):
def __init__(self, checkpoint="/data/sam_model/sam_vit_b_01ec64.pth", **kwargs):
super().__init__(**kwargs)
from segment_anything import sam_model_registry, SamPredictor
from segment_anything.modeling.sam import Sam
sam: Sam = sam_model_registry["vit_b"](checkpoint=checkpoint)
from segment_anything.modeling.image_encoder import (
window_partition,
window_unpartition,
)
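        # Override the ViT block forward so every block stashes its attention output,
        # MLP output, and residual-stream (block) output as attributes for later collection.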
def new_block_forward(self, x: torch.Tensor) -> torch.Tensor:
shortcut = x
x = self.norm1(x)
# Window partition
if self.window_size > 0:
H, W = x.shape[1], x.shape[2]
x, pad_hw = window_partition(x, self.window_size)
x = self.attn(x)
# Reverse window partition
if self.window_size > 0:
x = window_unpartition(x, self.window_size, pad_hw, (H, W))
self.attn_output = x.clone()
x = shortcut + x
            mlp_output = self.mlp(self.norm2(x))
            self.mlp_output = mlp_output.clone()
            x = x + mlp_output
self.block_output = x.clone()
return x
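        # All image-encoder blocks share one class, so patching it once hooks every layer.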
setattr(sam.image_encoder.blocks[0].__class__, "forward", new_block_forward)
self.image_encoder = sam.image_encoder
self.image_encoder.eval()
# self.image_encoder = self.image_encoder.cuda()
@torch.no_grad()
def forward(self, x: torch.Tensor) -> torch.Tensor:
with torch.no_grad():
x = torch.nn.functional.interpolate(x, size=(1024, 1024), mode="bilinear")
out = self.image_encoder(x)
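            # Read back the per-layer activations stashed by the patched block forward.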
attn_outputs, mlp_outputs, block_outputs = [], [], []
for i, blk in enumerate(self.image_encoder.blocks):
attn_outputs.append(blk.attn_output)
mlp_outputs.append(blk.mlp_output)
block_outputs.append(blk.block_output)
attn_outputs = torch.stack(attn_outputs)
mlp_outputs = torch.stack(mlp_outputs)
block_outputs = torch.stack(block_outputs)
return attn_outputs, mlp_outputs, block_outputs
def image_sam_feature(
images,
resolution=(1024, 1024),
node_type="block",
layer=-1,
):
transform = transforms.Compose(
[
transforms.Resize(resolution),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
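    # download the SAM ViT-B checkpoint on first run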
checkpoint = "sam_vit_b_01ec64.pth"
if not os.path.exists(checkpoint):
checkpoint_url = 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth'
import requests
r = requests.get(checkpoint_url)
with open(checkpoint, 'wb') as f:
f.write(r.content)
feat_extractor = SAM(checkpoint=checkpoint)
# attn_outputs, mlp_outputs, block_outputs = [], [], []
outputs = []
for i, image in enumerate(images):
torch_image = transform(image)
attn_output, mlp_output, block_output = feat_extractor(
# torch_image.unsqueeze(0).cuda()
torch_image.unsqueeze(0)
)
out_dict = {
"attn": attn_output,
"mlp": mlp_output,
"block": block_output,
}
out = out_dict[node_type]
out = out[layer]
outputs.append(out.cpu())
outputs = torch.cat(outputs, dim=0)
return outputs
class DiNOv2(torch.nn.Module):
def __init__(self, ver="dinov2_vitb14_reg"):
super().__init__()
self.dinov2 = torch.hub.load("facebookresearch/dinov2", ver)
self.dinov2.requires_grad_(False)
self.dinov2.eval()
# self.dinov2 = self.dinov2.cuda()
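        # Same hook as for SAM: override the block forward to stash per-layer outputs.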
def new_block_forward(self, x: torch.Tensor) -> torch.Tensor:
def attn_residual_func(x):
return self.ls1(self.attn(self.norm1(x)))
def ffn_residual_func(x):
return self.ls2(self.mlp(self.norm2(x)))
attn_output = attn_residual_func(x)
self.attn_output = attn_output.clone()
x = x + attn_output
mlp_output = ffn_residual_func(x)
self.mlp_output = mlp_output.clone()
x = x + mlp_output
block_output = x
self.block_output = block_output.clone()
return x
setattr(self.dinov2.blocks[0].__class__, "forward", new_block_forward)
@torch.no_grad()
def forward(self, x):
out = self.dinov2(x)
attn_outputs, mlp_outputs, block_outputs = [], [], []
for i, blk in enumerate(self.dinov2.blocks):
attn_outputs.append(blk.attn_output)
mlp_outputs.append(blk.mlp_output)
block_outputs.append(blk.block_output)
attn_outputs = torch.stack(attn_outputs)
mlp_outputs = torch.stack(mlp_outputs)
block_outputs = torch.stack(block_outputs)
return attn_outputs, mlp_outputs, block_outputs
def image_dino_feature(images, resolution=(448, 448), node_type="block", layer=-1):
transform = transforms.Compose(
[
transforms.Resize(resolution),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
feat_extractor = DiNOv2()
outputs = []
for i, image in enumerate(images):
torch_image = transform(image)
attn_output, mlp_output, block_output = feat_extractor(
# torch_image.unsqueeze(0).cuda()
torch_image.unsqueeze(0)
)
out_dict = {
"attn": attn_output,
"mlp": mlp_output,
"block": block_output,
}
out = out_dict[node_type]
out = out[layer]
outputs.append(out.cpu())
outputs = torch.cat(outputs, dim=0)
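    # drop the CLS token and the 4 register tokens, then fold the 1024 patch tokens
    # back into a 32x32 grid (448 / 14 = 32 patches per side)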
outputs = rearrange(outputs[:, 5:, :], "b (h w) c -> b h w c", h=32, w=32)
return outputs
class CLIP(torch.nn.Module):
def __init__(self):
super().__init__()
from transformers import CLIPProcessor, CLIPModel
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
self.model = model.eval()
# self.model = self.model.cuda()
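        # Patch the CLIP vision encoder layer to stash attention, MLP, and block outputs,
        # mirroring the hooks used for SAM and DiNOv2 above.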
def new_forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
causal_attention_mask: torch.Tensor,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.FloatTensor]:
residual = hidden_states
hidden_states = self.layer_norm1(hidden_states)
hidden_states, attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
causal_attention_mask=causal_attention_mask,
output_attentions=output_attentions,
)
self.attn_output = hidden_states.clone()
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.layer_norm2(hidden_states)
hidden_states = self.mlp(hidden_states)
self.mlp_output = hidden_states.clone()
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
self.block_output = hidden_states.clone()
return outputs
setattr(self.model.vision_model.encoder.layers[0].__class__, "forward", new_forward)
@torch.no_grad()
def forward(self, x):
out = self.model.vision_model(x)
attn_outputs, mlp_outputs, block_outputs = [], [], []
for i, blk in enumerate(self.model.vision_model.encoder.layers):
attn_outputs.append(blk.attn_output)
mlp_outputs.append(blk.mlp_output)
block_outputs.append(blk.block_output)
attn_outputs = torch.stack(attn_outputs)
mlp_outputs = torch.stack(mlp_outputs)
block_outputs = torch.stack(block_outputs)
return attn_outputs, mlp_outputs, block_outputs
def image_clip_feature(
images, resolution=(224, 224), node_type="block", layer=-1
):
if isinstance(images, list):
assert isinstance(images[0], Image.Image), "Input must be a list of PIL images."
else:
assert isinstance(images, Image.Image), "Input must be a PIL image."
images = [images]
transform = transforms.Compose(
[
transforms.Resize(resolution),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
feat_extractor = CLIP()
outputs = []
for i, image in enumerate(images):
torch_image = transform(image)
attn_output, mlp_output, block_output = feat_extractor(
# torch_image.unsqueeze(0).cuda()
torch_image.unsqueeze(0)
)
out_dict = {
"attn": attn_output,
"mlp": mlp_output,
"block": block_output,
}
out = out_dict[node_type]
out = out[layer]
outputs.append(out.cpu())
    outputs = torch.cat(outputs, dim=0)
    # drop the CLS token and fold the 196 patch tokens into a 14x14 grid (224 / 16 = 14),
    # matching the (b, h, w, c) layout returned by the SAM and DiNOv2 extractors
    outputs = rearrange(outputs[:, 1:, :], "b (h w) c -> b h w c", h=14, w=14)
return outputs
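# The model-name strings checked here must match the choices offered in the Gradio dropdown below.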
def extract_features(images, model_name="SAM(sam_vit_b)", node_type="block", layer=-1):
if model_name == "SAM(sam_vit_b)":
return image_sam_feature(images, node_type=node_type, layer=layer)
elif model_name == "DiNO(dinov2_vitb14_reg)":
return image_dino_feature(images, node_type=node_type, layer=layer)
elif model_name == "CLIP(openai/clip-vit-base-patch16)":
return image_clip_feature(images, node_type=node_type, layer=layer)
else:
raise ValueError(f"Model {model_name} not supported.")
def compute_ncut(
features,
num_eig=100,
num_sample_ncut=10000,
affinity_focal_gamma=0.3,
knn_ncut=10,
knn_tsne=10,
num_sample_tsne=1000,
perplexity=500,
):
from ncut_pytorch import NCUT, rgb_from_tsne_3d
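    # NCUT returns the top eigenvectors of the normalized affinity graph (Nystrom-approximated);
    # rgb_from_tsne_3d embeds those eigenvectors in 3D with t-SNE and uses the coordinates as RGB.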
eigvecs, eigvals = NCUT(
num_eig=num_eig,
num_sample=num_sample_ncut,
# device="cuda:0",
affinity_focal_gamma=affinity_focal_gamma,
knn=knn_ncut,
).fit_transform(features.reshape(-1, features.shape[-1]))
X_3d, rgb = rgb_from_tsne_3d(
eigvecs,
num_sample=num_sample_tsne,
perplexity=perplexity,
knn=knn_tsne,
)
rgb = rgb.reshape(features.shape[:3] + (3,))
return rgb
def dont_use_too_much_green(image_rgb):
    # reorder the color channels so that, in the central 40% crop of each image,
    # the red channel has the largest mean (avoids an overwhelmingly green rendering)
    x1, x2 = int(image_rgb.shape[1] * 0.3), int(image_rgb.shape[1] * 0.7)
    y1, y2 = int(image_rgb.shape[2] * 0.3), int(image_rgb.shape[2] * 0.7)
    channel_means = image_rgb[:, x1:x2, y1:y2].mean((0, 1, 2))
    sorted_indices = channel_means.argsort(descending=True)
image_rgb = image_rgb[:, :, :, sorted_indices]
return image_rgb
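# Convert a (b, h, w, 3) float tensor in [0, 1] into 256x256 PIL images for the output gallery.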
def to_pil_images(images):
return [
Image.fromarray((image * 255).cpu().numpy().astype(np.uint8)).resize((256, 256), Image.NEAREST)
for image in images
]
def main_fn(
images,
model_name="SAM(sam_vit_b)",
node_type="block",
layer=-1,
num_eig=100,
affinity_focal_gamma=0.3,
num_sample_ncut=10000,
knn_ncut=10,
num_sample_tsne=1000,
knn_tsne=10,
perplexity=500,
):
if perplexity >= num_sample_tsne:
# raise gr.Error("Perplexity must be less than the number of samples for t-SNE.")
gr.Warning("Perplexity must be less than the number of samples for t-SNE.\n" f"Setting perplexity to {num_sample_tsne-1}.")
perplexity = num_sample_tsne - 1
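    # gr.Gallery hands back (image, caption) tuples; keep only the images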
images = [image[0] for image in images]
features = extract_features(
images, model_name=model_name, node_type=node_type, layer=layer
)
rgb = compute_ncut(
features,
num_eig=num_eig,
num_sample_ncut=num_sample_ncut,
affinity_focal_gamma=affinity_focal_gamma,
knn_ncut=knn_ncut,
knn_tsne=knn_tsne,
num_sample_tsne=num_sample_tsne,
perplexity=perplexity,
)
rgb = dont_use_too_much_green(rgb)
return to_pil_images(rgb)
default_images = ['./images/image_0.jpg', './images/image_1.jpg', './images/image_2.jpg', './images/image_3.jpg', './images/image_4.jpg', './images/image_5.jpg']
default_outputs = ['./images/ncut_0.jpg', './images/ncut_1.jpg', './images/ncut_2.jpg', './images/ncut_3.jpg', './images/ncut_4.jpg', './images/ncut_5.jpg']
demo = gr.Interface(
main_fn,
[
gr.Gallery(value=default_images, label="Select images", show_label=False, elem_id="images", columns=[3], rows=[1], object_fit="contain", height="auto", type="pil"),
        gr.Dropdown(["SAM(sam_vit_b)", "DiNO(dinov2_vitb14_reg)", "CLIP(openai/clip-vit-base-patch16)"], label="Model", value="SAM(sam_vit_b)", elem_id="model_name"),
gr.Dropdown(["attn", "mlp", "block"], label="Node type", value="block", elem_id="node_type", info="attn: attention output, mlp: mlp output, block: sum of residual stream"),
gr.Slider(0, 11, step=1, label="Layer", value=11, elem_id="layer", info="which layer of the image backbone features"),
gr.Slider(1, 1000, step=1, label="Number of eigenvectors", value=100, elem_id="num_eig", info='increase for more object parts, decrease for whole object'),
gr.Slider(0.01, 1, step=0.01, label="Affinity focal gamma", value=0.3, elem_id="affinity_focal_gamma", info="decrease for more aggressive cleaning on the affinity matrix"),
],
gr.Gallery(value=default_outputs, label="NCUT Embedding", show_label=False, elem_id="ncut", columns=[3], rows=[1], object_fit="contain", height="auto"),
additional_inputs=[
gr.Slider(100, 30000, step=100, label="num_sample (NCUT)", value=10000, elem_id="num_sample_ncut", info="for Nyström approximation"),
gr.Slider(1, 100, step=1, label="KNN (NCUT)", value=10, elem_id="knn_ncut", info="for Nyström approximation"),
gr.Slider(100, 10000, step=100, label="num_sample (t-SNE)", value=1000, elem_id="num_sample_tsne", info="for Nyström approximation. Adding will slow down t-SNE quite a lot"),
gr.Slider(1, 100, step=1, label="KNN (t-SNE)", value=10, elem_id="knn_tsne", info="for Nyström approximation"),
gr.Slider(10, 1000, step=10, label="Perplexity (t-SNE)", value=500, elem_id="perplexity", info="for t-SNE"),
]
)
demo.launch()