# Expedit-SAM / segment_anything / modeling / hourglass_image_encoder.py
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, Type
from .common import LayerNorm2d, MLPBlock
from .image_encoder import (
window_partition,
window_unpartition,
add_decomposed_rel_pos,
ImageEncoderViT,
Block,
Attention,
)
class TokenClusteringBlock(nn.Module):
def __init__(self, num_spixels=None, n_iters=5, temperture=0.05, window_size=7):
super().__init__()
if isinstance(num_spixels, tuple):
assert len(num_spixels) == 2
elif num_spixels is not None:
x = int(math.sqrt(num_spixels))
assert x * x == num_spixels
num_spixels = (x, x)
self.num_spixels = num_spixels
self.n_iters = n_iters
self.temperture = temperture
assert window_size % 2 == 1
self.r = window_size // 2
def calc_init_centroid(self, images, num_spixels_width, num_spixels_height):
"""
calculate initial superpixels
Args:
images: torch.Tensor
A Tensor of shape (B, C, H, W)
spixels_width: int
initial superpixel width
spixels_height: int
initial superpixel height
Return:
centroids: torch.Tensor
A Tensor of shape (B, C, H * W)
init_label_map: torch.Tensor
A Tensor of shape (B, H * W)
num_spixels_width: int
A number of superpixels in each column
num_spixels_height: int
A number of superpixels int each raw
"""
batchsize, channels, height, width = images.shape
device = images.device
centroids = torch.nn.functional.adaptive_avg_pool2d(
images, (num_spixels_height, num_spixels_width)
)
with torch.no_grad():
num_spixels = num_spixels_width * num_spixels_height
labels = (
torch.arange(num_spixels, device=device)
.reshape(1, 1, *centroids.shape[-2:])
.type_as(centroids)
)
init_label_map = torch.nn.functional.interpolate(
labels, size=(height, width), mode="nearest"
).type_as(centroids)
init_label_map = init_label_map.repeat(batchsize, 1, 1, 1)
init_label_map = init_label_map.reshape(batchsize, -1)
centroids = centroids.reshape(batchsize, channels, -1)
return centroids, init_label_map
def forward(self, pixel_features, num_spixels=None):
if num_spixels is None:
num_spixels = self.num_spixels
assert num_spixels is not None
else:
if isinstance(num_spixels, tuple):
assert len(num_spixels) == 2
else:
x = int(math.sqrt(num_spixels))
assert x * x == num_spixels
num_spixels = (x, x)
        pixel_features = pixel_features.permute(0, 3, 1, 2)  # (B, H, W, C) -> (B, C, H, W)
num_spixels_height, num_spixels_width = num_spixels
num_spixels = num_spixels_width * num_spixels_height
spixel_features, init_label_map = self.calc_init_centroid(
pixel_features, num_spixels_width, num_spixels_height
)
device = init_label_map.device
        spixels_number = torch.arange(num_spixels, device=device)[None, :, None]
        # Column/row offsets between each pixel's initial cluster and every cluster;
        # assignments are restricted to clusters within `self.r` of the initial one.
        relative_labels_widths = (
            init_label_map[:, None] % num_spixels_width
            - spixels_number % num_spixels_width
        )
        relative_labels_heights = (
            torch.div(init_label_map[:, None], num_spixels_width, rounding_mode='trunc')
            - torch.div(spixels_number, num_spixels_width, rounding_mode='trunc')
        )
        mask = torch.logical_and(
            torch.abs(relative_labels_widths) <= self.r,
            torch.abs(relative_labels_heights) <= self.r,
        )
        mask_dist = (~mask) * 1e16  # large additive penalty outside the local window
pixel_features = pixel_features.reshape(*pixel_features.shape[:2], -1) # (B, C, L)
permuted_pixel_features = pixel_features.permute(0, 2, 1) # (B, L, C)
        for _ in range(self.n_iters):
            # E-step: soft-assign each token to centroids within its local window.
            dist_matrix = self.pairwise_dist(pixel_features, spixel_features)  # (B, M, L)
            dist_matrix += mask_dist
            affinity_matrix = (-dist_matrix * self.temperture).softmax(1)
            # M-step: recompute each centroid as the affinity-weighted mean of tokens.
            spixel_features = torch.bmm(affinity_matrix.detach(), permuted_pixel_features)
            spixel_features = spixel_features / affinity_matrix.detach().sum(2, keepdim=True).clamp_(min=1e-16)
            spixel_features = spixel_features.permute(0, 2, 1)
        # Final hard assignment: each token goes to its nearest centroid.
        dist_matrix = self.pairwise_dist(pixel_features, spixel_features)
        hard_labels = torch.argmin(dist_matrix, dim=1)
B, C, _ = spixel_features.shape
spixel_features = spixel_features.permute(0, 2, 1).reshape(B, num_spixels_height, num_spixels_width, C)
return spixel_features, hard_labels
def pairwise_dist(self, f1, f2):
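        # Squared Euclidean distance via the expansion
        # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 <a, b>.
        # f1: (B, C, N) tokens, f2: (B, C, M) centroids -> (B, M, N).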
return ((f1 * f1).sum(dim=1).unsqueeze(1)
+ (f2 * f2).sum(dim=1).unsqueeze(2)
- 2 * torch.einsum("bcm, bcn -> bmn", f2, f1))
def extra_repr(self):
return f"num_spixels={self.num_spixels}, n_iters={self.n_iters}"
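

# A minimal usage sketch for TokenClusteringBlock (toy sizes chosen for
# illustration; they are not values taken from any Expedit-SAM config):
#
#   clustering = TokenClusteringBlock(num_spixels=196, n_iters=5)
#   feats = torch.randn(2, 28, 28, 64)          # (B, H, W, C)
#   centroids, hard_labels = clustering(feats)
#   # centroids:   (2, 14, 14, 64) cluster features on a 14x14 grid
#   # hard_labels: (2, 28 * 28) nearest-centroid index per input token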
def naive_unpool(f_regions, region_indices):
_, _, C = f_regions.shape
N, L = region_indices.shape
index = region_indices.view(N, L, 1).expand(N, L, C)
result = f_regions.gather(1, index)
return result
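
# `naive_unpool` copies each region (cluster) feature back to every token that
# was assigned to it. A toy example with assumed values:
#
#   regions = torch.tensor([[[1.0], [2.0]]])  # (N=1, L'=2 regions, C=1)
#   labels = torch.tensor([[0, 1, 1, 0]])     # (N=1, L=4 tokens)
#   naive_unpool(regions, labels)             # -> [[[1.], [2.], [2.], [1.]]]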
class State:
def __init__(self, unpooling):
self.unpooling = unpooling
self.__updated = False
@property
def updated(self):
return self.__updated
def get(self, name, default=None):
return getattr(self, name, default)
def update_state(self, **states: dict):
self.__updated = True
for k, v in states.items():
setattr(self, k, v)
def call(self, input: torch.Tensor):
return self.unpooling(input, self)
class UnpoolingBase(nn.Module):
def forward(self, x, state: State):
if not state.updated:
return x, False
return self._forward(x, state)
def derive_unpooler(self):
return State(self)
class NaiveUnpooling(UnpoolingBase):
def _forward(self, x, state: State):
return naive_unpool(x, state.hard_labels), False
class TokenReconstructionBlock(UnpoolingBase):
def __init__(self, k=3, temperture=0.05):
super().__init__()
self.k = k
self.temperture = temperture
def _forward(self, x, state: State):
feat = state.feat_before_pooling
sfeat = state.feat_after_pooling
ds = (
(feat * feat).sum(dim=2).unsqueeze(2)
+ (sfeat * sfeat).sum(dim=2).unsqueeze(1)
- 2 * torch.einsum("bnc, bmc -> bnm", feat, sfeat)
        )  # squared distance between tokens and cluster centroids
ds[ds < 0] = 0
weight = torch.exp(-self.temperture * ds)
if self.k >= 0:
topk, indices = torch.topk(weight, k=self.k, dim=2)
mink = torch.min(topk, dim=-1).values
mink = mink.unsqueeze(-1).repeat(1, 1, weight.shape[-1])
mask = torch.ge(weight, mink)
            zero = torch.zeros_like(weight)
attention = torch.where(mask, weight, zero)
attention = F.normalize(attention, dim=2)
ret = torch.einsum("bnm, bmc -> bnc", attention, x)
return ret, False
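
# Sketch of the reconstruction protocol (mirroring how HourglassImageEncoderViT
# drives it below; shapes are illustrative):
#
#   recon = TokenReconstructionBlock(k=3)
#   state = recon.derive_unpooler()
#   state.update_state(feat_before_pooling=tokens)    # (B, L, C), pre-clustering
#   state.update_state(feat_after_pooling=centroids)  # (B, L', C), post-clustering
#   out, _ = state.call(processed_centroids)          # -> (B, L, C)
#
# Each token is rebuilt as a weighted sum of its k nearest centroids in feature
# space, with weights exp(-temperture * squared_distance) normalized over the
# kept centroids.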
class HourglassImageEncoderViT(ImageEncoderViT):
def __init__(
self,
img_size: int = 1024,
patch_size: int = 16,
in_chans: int = 3,
embed_dim: int = 768,
depth: int = 12,
num_heads: int = 12,
mlp_ratio: float = 4.0,
out_chans: int = 256,
qkv_bias: bool = True,
norm_layer: Type[nn.Module] = nn.LayerNorm,
act_layer: Type[nn.Module] = nn.GELU,
use_abs_pos: bool = True,
use_rel_pos: bool = False,
rel_pos_zero_init: bool = True,
window_size: int = 0,
global_attn_indexes: Tuple[int, ...] = (),
hourglass_clustering_location: int = -1,
        hourglass_num_cluster: Optional[int] = None,
hourglass_cluster_iters: int = 5,
hourglass_temperture: float = 0.01,
hourglass_cluster_window_size: int = 5,
hourglass_reconstruction_k: int = 20,
) -> None:
"""
Args:
img_size (int): Input image size.
patch_size (int): Patch size.
in_chans (int): Number of input image channels.
embed_dim (int): Patch embedding dimension.
depth (int): Depth of ViT.
num_heads (int): Number of attention heads in each ViT block.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool): If True, add a learnable bias to query, key, value.
norm_layer (nn.Module): Normalization layer.
act_layer (nn.Module): Activation layer.
use_abs_pos (bool): If True, use absolute positional embeddings.
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
window_size (int): Window size for window attention blocks.
global_attn_indexes (list): Indexes for blocks using global attention.
"""
super().__init__(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
depth=depth,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
out_chans=out_chans,
qkv_bias=qkv_bias,
norm_layer=norm_layer,
act_layer=act_layer,
use_abs_pos=use_abs_pos,
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
window_size=window_size,
global_attn_indexes=global_attn_indexes,
)
        self.window_size = window_size
        assert hourglass_num_cluster is not None, "hourglass_num_cluster must be set"
        self.ws_new = int(math.sqrt(hourglass_num_cluster))  # window size after clustering
self.blocks = nn.ModuleList()
for i in range(depth):
block = HourglassBlock(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
norm_layer=norm_layer,
act_layer=act_layer,
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
                window_size=(
                    (window_size if i < hourglass_clustering_location else self.ws_new)
                    if i not in global_attn_indexes
                    else 0
                ),
window_size_ckpt=window_size,
input_size=(img_size // patch_size, img_size // patch_size),
)
self.blocks.append(block)
self.clustering_location = hourglass_clustering_location
self.token_clustering_block = TokenClusteringBlock(
num_spixels=hourglass_num_cluster,
n_iters=hourglass_cluster_iters,
temperture=hourglass_temperture,
window_size=hourglass_cluster_window_size,
)
self.token_reconstruction_block = TokenReconstructionBlock(
k=hourglass_reconstruction_k,
temperture=hourglass_temperture,
)
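
    # Hourglass flow: `cluster` pools each attention window down to
    # `hourglass_num_cluster` tokens, the blocks after `clustering_location`
    # run on the reduced grid, and `reconstruct` restores the full token grid
    # before the neck (see `forward`).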
def cluster(self, x, reconstructer):
# x: B, H, W, C
H, W = x.shape[1:3]
x, pad_hw = window_partition(x, self.window_size) # B*Nw, WH, WW, C
Bnw, _, _, C = x.shape
reconstructer.update_state(
feat_before_pooling=x.view(-1, self.window_size * self.window_size, C)
)
        x, hard_labels = self.token_clustering_block(x)  # B*Nw, ws_new, ws_new, C
reconstructer.update_state(hard_labels=hard_labels)
reconstructer.update_state(feat_after_pooling=x.view(Bnw, -1, C))
        # Reverse the window partition on the reduced (clustered) grid.
h = pad_hw[0] // self.window_size * x.shape[1]
w = pad_hw[1] // self.window_size * x.shape[2]
x = window_unpartition(x, self.ws_new, (h, w), (h, w))
# out: B, h, w, C
return x, pad_hw
    def reconstruct(self, x, H, W, reconstructer, pad_hw):
        # x: B, h, w, C
        x, _ = window_partition(x, self.ws_new)  # B*Nw, ws_new, ws_new, C
        Bnw, _, _, C = x.shape
        x = x.view(Bnw, -1, C)
        x, _ = reconstructer.call(x)  # B*Nw, WH*WW, C
        # merge windows
        x = x.view(-1, self.window_size, self.window_size, C)
        x = window_unpartition(x, self.window_size, pad_hw, (H, W))  # B, H, W, C
        return x
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        H, W = x.shape[1], x.shape[2]
        reconstructer = self.token_reconstruction_block.derive_unpooler()
        reconstructer.update_state(hw_shape=(H, W))
        pad_hw = None
        for i, blk in enumerate(self.blocks):
            if i == self.clustering_location:
                x, pad_hw = self.cluster(x, reconstructer)
            x = blk(x)
        if pad_hw is not None:  # only reconstruct if clustering was applied
            x = self.reconstruct(x, H, W, reconstructer, pad_hw)
        x = self.neck(x.permute(0, 3, 1, 2))
        return x
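
# A hedged construction sketch (hyperparameters below are illustrative, not the
# values of any released Expedit-SAM checkpoint):
#
#   encoder = HourglassImageEncoderViT(
#       img_size=1024, patch_size=16, embed_dim=768, depth=12, num_heads=12,
#       window_size=14, global_attn_indexes=(2, 5, 8, 11),
#       hourglass_clustering_location=6, hourglass_num_cluster=100,
#   )
#   feats = encoder(torch.randn(1, 3, 1024, 1024))  # -> (1, 256, 64, 64)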
class HourglassBlock(Block):
"""Transformer blocks with support of window attention and residual propagation blocks"""
def __init__(
self,
dim: int,
num_heads: int,
mlp_ratio: float = 4.0,
qkv_bias: bool = True,
norm_layer: Type[nn.Module] = nn.LayerNorm,
act_layer: Type[nn.Module] = nn.GELU,
use_rel_pos: bool = False,
rel_pos_zero_init: bool = True,
window_size: int = 0,
input_size: Optional[Tuple[int, int]] = None,
window_size_ckpt: int = 0,
) -> None:
"""
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads in each ViT block.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool): If True, add a learnable bias to query, key, value.
norm_layer (nn.Module): Normalization layer.
act_layer (nn.Module): Activation layer.
use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
window_size (int): Window size for window attention blocks. If it equals 0, then
use global attention.
input_size (int or None): Input resolution for calculating the relative positional
parameter size.
"""
super(HourglassBlock, self).__init__(
dim=dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
norm_layer=norm_layer,
act_layer=act_layer,
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
window_size=window_size,
input_size=input_size,
)
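        # Rebuild the attention module so that windowed blocks size their
        # relative-position tables for the checkpoint's original window
        # (`window_size_ckpt`) rather than the runtime window, which may have
        # shrunk to `ws_new` after clustering; SAM's `get_rel_pos` interpolates
        # the tables at runtime when the effective window differs.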
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
input_size=input_size if window_size == 0 else (window_size_ckpt, window_size_ckpt),
)