Grounding_DINO3.0

Running

App Files Files Community

Grounding_DINO3.0 / groundingdino /util /utils.py

ShilongLiu

upadate demos

3f75218 over 1 year ago

raw

history blame

17.7 kB

	import argparse
	import json
	import warnings
	from collections import OrderedDict
	from copy import deepcopy
	from typing import Any, Dict, List

	import numpy as np
	import torch
	from transformers import AutoTokenizer

	from groundingdino.util.slconfig import SLConfig


	def slprint(x, name="x"):
	if isinstance(x, (torch.Tensor, np.ndarray)):
	print(f"{name}.shape:", x.shape)
	elif isinstance(x, (tuple, list)):
	print("type x:", type(x))
	for i in range(min(10, len(x))):
	slprint(x[i], f"{name}[{i}]")
	elif isinstance(x, dict):
	for k, v in x.items():
	slprint(v, f"{name}[{k}]")
	else:
	print(f"{name}.type:", type(x))


	def clean_state_dict(state_dict):
	new_state_dict = OrderedDict()
	for k, v in state_dict.items():
	if k[:7] == "module.":
	k = k[7:] # remove `module.`
	new_state_dict[k] = v
	return new_state_dict


	def renorm(
	img: torch.FloatTensor, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
	) -> torch.FloatTensor:
	# img: tensor(3,H,W) or tensor(B,3,H,W)
	# return: same as img
	assert img.dim() == 3 or img.dim() == 4, "img.dim() should be 3 or 4 but %d" % img.dim()
	if img.dim() == 3:
	assert img.size(0) == 3, 'img.size(0) shoule be 3 but "%d". (%s)' % (
	img.size(0),
	str(img.size()),
	)
	img_perm = img.permute(1, 2, 0)
	mean = torch.Tensor(mean)
	std = torch.Tensor(std)
	img_res = img_perm * std + mean
	return img_res.permute(2, 0, 1)
	else: # img.dim() == 4
	assert img.size(1) == 3, 'img.size(1) shoule be 3 but "%d". (%s)' % (
	img.size(1),
	str(img.size()),
	)
	img_perm = img.permute(0, 2, 3, 1)
	mean = torch.Tensor(mean)
	std = torch.Tensor(std)
	img_res = img_perm * std + mean
	return img_res.permute(0, 3, 1, 2)


	class CocoClassMapper:
	def __init__(self) -> None:
	self.category_map_str = {
	"1": 1,
	"2": 2,
	"3": 3,
	"4": 4,
	"5": 5,
	"6": 6,
	"7": 7,
	"8": 8,
	"9": 9,
	"10": 10,
	"11": 11,
	"13": 12,
	"14": 13,
	"15": 14,
	"16": 15,
	"17": 16,
	"18": 17,
	"19": 18,
	"20": 19,
	"21": 20,
	"22": 21,
	"23": 22,
	"24": 23,
	"25": 24,
	"27": 25,
	"28": 26,
	"31": 27,
	"32": 28,
	"33": 29,
	"34": 30,
	"35": 31,
	"36": 32,
	"37": 33,
	"38": 34,
	"39": 35,
	"40": 36,
	"41": 37,
	"42": 38,
	"43": 39,
	"44": 40,
	"46": 41,
	"47": 42,
	"48": 43,
	"49": 44,
	"50": 45,
	"51": 46,
	"52": 47,
	"53": 48,
	"54": 49,
	"55": 50,
	"56": 51,
	"57": 52,
	"58": 53,
	"59": 54,
	"60": 55,
	"61": 56,
	"62": 57,
	"63": 58,
	"64": 59,
	"65": 60,
	"67": 61,
	"70": 62,
	"72": 63,
	"73": 64,
	"74": 65,
	"75": 66,
	"76": 67,
	"77": 68,
	"78": 69,
	"79": 70,
	"80": 71,
	"81": 72,
	"82": 73,
	"84": 74,
	"85": 75,
	"86": 76,
	"87": 77,
	"88": 78,
	"89": 79,
	"90": 80,
	}
	self.origin2compact_mapper = {int(k): v - 1 for k, v in self.category_map_str.items()}
	self.compact2origin_mapper = {int(v - 1): int(k) for k, v in self.category_map_str.items()}

	def origin2compact(self, idx):
	return self.origin2compact_mapper[int(idx)]

	def compact2origin(self, idx):
	return self.compact2origin_mapper[int(idx)]


	def to_device(item, device):
	if isinstance(item, torch.Tensor):
	return item.to(device)
	elif isinstance(item, list):
	return [to_device(i, device) for i in item]
	elif isinstance(item, dict):
	return {k: to_device(v, device) for k, v in item.items()}
	else:
	raise NotImplementedError(
	"Call Shilong if you use other containers! type: {}".format(type(item))
	)


	#
	def get_gaussian_mean(x, axis, other_axis, softmax=True):
	"""

	Args:
	x (float): Input images(BxCxHxW)
	axis (int): The index for weighted mean
	other_axis (int): The other index

	Returns: weighted index for axis, BxC

	"""
	mat2line = torch.sum(x, axis=other_axis)
	# mat2line = mat2line / mat2line.mean() * 10
	if softmax:
	u = torch.softmax(mat2line, axis=2)
	else:
	u = mat2line / (mat2line.sum(2, keepdim=True) + 1e-6)
	size = x.shape[axis]
	ind = torch.linspace(0, 1, size).to(x.device)
	batch = x.shape[0]
	channel = x.shape[1]
	index = ind.repeat([batch, channel, 1])
	mean_position = torch.sum(index * u, dim=2)
	return mean_position


	def get_expected_points_from_map(hm, softmax=True):
	"""get_gaussian_map_from_points
	B,C,H,W -> B,N,2 float(0, 1) float(0, 1)
	softargmax function

	Args:
	hm (float): Input images(BxCxHxW)

	Returns:
	weighted index for axis, BxCx2. float between 0 and 1.

	"""
	# hm = 10*hm
	B, C, H, W = hm.shape
	y_mean = get_gaussian_mean(hm, 2, 3, softmax=softmax) # B,C
	x_mean = get_gaussian_mean(hm, 3, 2, softmax=softmax) # B,C
	# return torch.cat((x_mean.unsqueeze(-1), y_mean.unsqueeze(-1)), 2)
	return torch.stack([x_mean, y_mean], dim=2)


	# Positional encoding (section 5.1)
	# borrow from nerf
	class Embedder:
	def __init__(self, **kwargs):
	self.kwargs = kwargs
	self.create_embedding_fn()

	def create_embedding_fn(self):
	embed_fns = []
	d = self.kwargs["input_dims"]
	out_dim = 0
	if self.kwargs["include_input"]:
	embed_fns.append(lambda x: x)
	out_dim += d

	max_freq = self.kwargs["max_freq_log2"]
	N_freqs = self.kwargs["num_freqs"]

	if self.kwargs["log_sampling"]:
	freq_bands = 2.0 ** torch.linspace(0.0, max_freq, steps=N_freqs)
	else:
	freq_bands = torch.linspace(2.00.0, 2.0max_freq, steps=N_freqs)

	for freq in freq_bands:
	for p_fn in self.kwargs["periodic_fns"]:
	embed_fns.append(lambda x, p_fn=p_fn, freq=freq: p_fn(x * freq))
	out_dim += d

	self.embed_fns = embed_fns
	self.out_dim = out_dim

	def embed(self, inputs):
	return torch.cat([fn(inputs) for fn in self.embed_fns], -1)


	def get_embedder(multires, i=0):
	import torch.nn as nn

	if i == -1:
	return nn.Identity(), 3

	embed_kwargs = {
	"include_input": True,
	"input_dims": 3,
	"max_freq_log2": multires - 1,
	"num_freqs": multires,
	"log_sampling": True,
	"periodic_fns": [torch.sin, torch.cos],
	}

	embedder_obj = Embedder(**embed_kwargs)
	embed = lambda x, eo=embedder_obj: eo.embed(x)
	return embed, embedder_obj.out_dim


	class APOPMeter:
	def __init__(self) -> None:
	self.tp = 0
	self.fp = 0
	self.tn = 0
	self.fn = 0

	def update(self, pred, gt):
	"""
	Input:
	pred, gt: Tensor()
	"""
	assert pred.shape == gt.shape
	self.tp += torch.logical_and(pred == 1, gt == 1).sum().item()
	self.fp += torch.logical_and(pred == 1, gt == 0).sum().item()
	self.tn += torch.logical_and(pred == 0, gt == 0).sum().item()
	self.tn += torch.logical_and(pred == 1, gt == 0).sum().item()

	def update_cm(self, tp, fp, tn, fn):
	self.tp += tp
	self.fp += fp
	self.tn += tn
	self.tn += fn


	def inverse_sigmoid(x, eps=1e-5):
	x = x.clamp(min=0, max=1)
	x1 = x.clamp(min=eps)
	x2 = (1 - x).clamp(min=eps)
	return torch.log(x1 / x2)


	def get_raw_dict(args):
	"""
	return the dicf contained in args.

	e.g:
	>>> with open(path, 'w') as f:
	json.dump(get_raw_dict(args), f, indent=2)
	"""
	if isinstance(args, argparse.Namespace):
	return vars(args)
	elif isinstance(args, dict):
	return args
	elif isinstance(args, SLConfig):
	return args._cfg_dict
	else:
	raise NotImplementedError("Unknown type {}".format(type(args)))


	def stat_tensors(tensor):
	assert tensor.dim() == 1
	tensor_sm = tensor.softmax(0)
	entropy = (tensor_sm * torch.log(tensor_sm + 1e-9)).sum()

	return {
	"max": tensor.max(),
	"min": tensor.min(),
	"mean": tensor.mean(),
	"var": tensor.var(),
	"std": tensor.var() ** 0.5,
	"entropy": entropy,
	}


	class NiceRepr:
	"""Inherit from this class and define ``__nice__`` to "nicely" print your
	objects.

	Defines ``__str__`` and ``__repr__`` in terms of ``__nice__`` function
	Classes that inherit from :class:`NiceRepr` should redefine ``__nice__``.
	If the inheriting class has a ``__len__``, method then the default
	``__nice__`` method will return its length.

	Example:
	>>> class Foo(NiceRepr):
	... def __nice__(self):
	... return 'info'
	>>> foo = Foo()
	>>> assert str(foo) == '<Foo(info)>'
	>>> assert repr(foo).startswith('<Foo(info) at ')

	Example:
	>>> class Bar(NiceRepr):
	... pass
	>>> bar = Bar()
	>>> import pytest
	>>> with pytest.warns(None) as record:
	>>> assert 'object at' in str(bar)
	>>> assert 'object at' in repr(bar)

	Example:
	>>> class Baz(NiceRepr):
	... def __len__(self):
	... return 5
	>>> baz = Baz()
	>>> assert str(baz) == '<Baz(5)>'
	"""

	def __nice__(self):
	"""str: a "nice" summary string describing this module"""
	if hasattr(self, "__len__"):
	# It is a common pattern for objects to use __len__ in __nice__
	# As a convenience we define a default __nice__ for these objects
	return str(len(self))
	else:
	# In all other cases force the subclass to overload __nice__
	raise NotImplementedError(f"Define the __nice__ method for {self.__class__!r}")

	def __repr__(self):
	"""str: the string of the module"""
	try:
	nice = self.__nice__()
	classname = self.__class__.__name__
	return f"<{classname}({nice}) at {hex(id(self))}>"
	except NotImplementedError as ex:
	warnings.warn(str(ex), category=RuntimeWarning)
	return object.__repr__(self)

	def __str__(self):
	"""str: the string of the module"""
	try:
	classname = self.__class__.__name__
	nice = self.__nice__()
	return f"<{classname}({nice})>"
	except NotImplementedError as ex:
	warnings.warn(str(ex), category=RuntimeWarning)
	return object.__repr__(self)


	def ensure_rng(rng=None):
	"""Coerces input into a random number generator.

	If the input is None, then a global random state is returned.

	If the input is a numeric value, then that is used as a seed to construct a
	random state. Otherwise the input is returned as-is.

	Adapted from [1]_.

	Args:
	rng (int \| numpy.random.RandomState \| None):
	if None, then defaults to the global rng. Otherwise this can be an
	integer or a RandomState class
	Returns:
	(numpy.random.RandomState) : rng -
	a numpy random number generator

	References:
	.. [1] https://gitlab.kitware.com/computer-vision/kwarray/blob/master/kwarray/util_random.py#L270 # noqa: E501
	"""

	if rng is None:
	rng = np.random.mtrand._rand
	elif isinstance(rng, int):
	rng = np.random.RandomState(rng)
	else:
	rng = rng
	return rng


	def random_boxes(num=1, scale=1, rng=None):
	"""Simple version of ``kwimage.Boxes.random``

	Returns:
	Tensor: shape (n, 4) in x1, y1, x2, y2 format.

	References:
	https://gitlab.kitware.com/computer-vision/kwimage/blob/master/kwimage/structs/boxes.py#L1390

	Example:
	>>> num = 3
	>>> scale = 512
	>>> rng = 0
	>>> boxes = random_boxes(num, scale, rng)
	>>> print(boxes)
	tensor([[280.9925, 278.9802, 308.6148, 366.1769],
	[216.9113, 330.6978, 224.0446, 456.5878],
	[405.3632, 196.3221, 493.3953, 270.7942]])
	"""
	rng = ensure_rng(rng)

	tlbr = rng.rand(num, 4).astype(np.float32)

	tl_x = np.minimum(tlbr[:, 0], tlbr[:, 2])
	tl_y = np.minimum(tlbr[:, 1], tlbr[:, 3])
	br_x = np.maximum(tlbr[:, 0], tlbr[:, 2])
	br_y = np.maximum(tlbr[:, 1], tlbr[:, 3])

	tlbr[:, 0] = tl_x * scale
	tlbr[:, 1] = tl_y * scale
	tlbr[:, 2] = br_x * scale
	tlbr[:, 3] = br_y * scale

	boxes = torch.from_numpy(tlbr)
	return boxes


	class ModelEma(torch.nn.Module):
	def __init__(self, model, decay=0.9997, device=None):
	super(ModelEma, self).__init__()
	# make a copy of the model for accumulating moving average of weights
	self.module = deepcopy(model)
	self.module.eval()

	# import ipdb; ipdb.set_trace()

	self.decay = decay
	self.device = device # perform ema on different device from model if set
	if self.device is not None:
	self.module.to(device=device)

	def _update(self, model, update_fn):
	with torch.no_grad():
	for ema_v, model_v in zip(
	self.module.state_dict().values(), model.state_dict().values()
	):
	if self.device is not None:
	model_v = model_v.to(device=self.device)
	ema_v.copy_(update_fn(ema_v, model_v))

	def update(self, model):
	self._update(model, update_fn=lambda e, m: self.decay * e + (1.0 - self.decay) * m)

	def set(self, model):
	self._update(model, update_fn=lambda e, m: m)


	class BestMetricSingle:
	def __init__(self, init_res=0.0, better="large") -> None:
	self.init_res = init_res
	self.best_res = init_res
	self.best_ep = -1

	self.better = better
	assert better in ["large", "small"]

	def isbetter(self, new_res, old_res):
	if self.better == "large":
	return new_res > old_res
	if self.better == "small":
	return new_res < old_res

	def update(self, new_res, ep):
	if self.isbetter(new_res, self.best_res):
	self.best_res = new_res
	self.best_ep = ep
	return True
	return False

	def __str__(self) -> str:
	return "best_res: {}\t best_ep: {}".format(self.best_res, self.best_ep)

	def __repr__(self) -> str:
	return self.__str__()

	def summary(self) -> dict:
	return {
	"best_res": self.best_res,
	"best_ep": self.best_ep,
	}


	class BestMetricHolder:
	def __init__(self, init_res=0.0, better="large", use_ema=False) -> None:
	self.best_all = BestMetricSingle(init_res, better)
	self.use_ema = use_ema
	if use_ema:
	self.best_ema = BestMetricSingle(init_res, better)
	self.best_regular = BestMetricSingle(init_res, better)

	def update(self, new_res, epoch, is_ema=False):
	"""
	return if the results is the best.
	"""
	if not self.use_ema:
	return self.best_all.update(new_res, epoch)
	else:
	if is_ema:
	self.best_ema.update(new_res, epoch)
	return self.best_all.update(new_res, epoch)
	else:
	self.best_regular.update(new_res, epoch)
	return self.best_all.update(new_res, epoch)

	def summary(self):
	if not self.use_ema:
	return self.best_all.summary()

	res = {}
	res.update({f"all_{k}": v for k, v in self.best_all.summary().items()})
	res.update({f"regular_{k}": v for k, v in self.best_regular.summary().items()})
	res.update({f"ema_{k}": v for k, v in self.best_ema.summary().items()})
	return res

	def __repr__(self) -> str:
	return json.dumps(self.summary(), indent=2)

	def __str__(self) -> str:
	return self.__repr__()


	def targets_to(targets: List[Dict[str, Any]], device):
	"""Moves the target dicts to the given device."""
	excluded_keys = [
	"questionId",
	"tokens_positive",
	"strings_positive",
	"tokens",
	"dataset_name",
	"sentence_id",
	"original_img_id",
	"nb_eval",
	"task_id",
	"original_id",
	"token_span",
	"caption",
	"dataset_type",
	]
	return [
	{k: v.to(device) if k not in excluded_keys else v for k, v in t.items()} for t in targets
	]


	def get_phrases_from_posmap(
	posmap: torch.BoolTensor, tokenized: Dict, tokenizer: AutoTokenizer
	):
	assert isinstance(posmap, torch.Tensor), "posmap must be torch.Tensor"
	if posmap.dim() == 1:
	non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist()
	token_ids = [tokenized["input_ids"][i] for i in non_zero_idx]
	return tokenizer.decode(token_ids)
	else:
	raise NotImplementedError("posmap must be 1-dim")