# ForkedHulk2/core/utils.py
import numpy as np
import shutil
import torch
import os
import io
import copy
import math
import logging
from collections import defaultdict
import torch.distributed as dist
from torch.nn import BatchNorm2d
from torch.utils.checkpoint import checkpoint
import cv2
import subprocess
from PIL import Image
import core.fp16 as fp16
from typing import Optional, List
from torch import Tensor
import torch._utils
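# Compatibility shim (comment added for clarity): very old PyTorch builds lack
# torch._utils._rebuild_tensor_v2, which checkpoints saved by newer versions reference when
# unpickled; the fallback below lets such checkpoints still be loaded.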
try:
torch._utils._rebuild_tensor_v2
except AttributeError:
def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, backward_hooks):
tensor = torch._utils._rebuild_tensor(storage, storage_offset, size, stride)
tensor.requires_grad = requires_grad
tensor._backward_hooks = backward_hooks
return tensor
torch._utils._rebuild_tensor_v2 = _rebuild_tensor_v2
import torch.nn as nn
cv2.ocl.setUseOpenCL(False)
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, length):
self.length = length
self.reset()
def reset(self):
self.history = []
self.val = 0
self.avg = 0
def empty(self):
return len(self.history) == 0
def update(self, val):
self.history.append(val)
if self.length > 0 and len(self.history) > self.length:
del self.history[0]
self.val = val
self.avg = np.mean(self.history)
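# Illustrative usage sketch (not part of the original module): AverageMeter keeps a sliding
# window of the last `length` values and exposes the latest value and the window mean, the
# usual way training loops smooth loss/metric logging.
#
#   meter = AverageMeter(length=100)        # window of the 100 most recent updates
#   for step_loss in (0.9, 0.7, 0.5):
#       meter.update(step_loss)
#   meter.val   # 0.5, the latest value
#   meter.avg   # mean over the stored history (0.7 here)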
class AverageMinMaxMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, length):
self.length = length
self.reset()
def reset(self):
self.history = []
self.val = 0
        self.min = float('inf')   # so the first update always sets the true minimum
        self.max = float('-inf')  # so the first update always sets the true maximum
self.avg = 0
def empty(self):
return len(self.history) == 0
def update(self, val):
self.history.append(val)
if self.length > 0 and len(self.history) > self.length:
del self.history[0]
self.val = val
self.avg = np.mean(self.history)
self.min = min(self.min, val)
self.max = max(self.max, val)
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.reshape(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
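# Illustrative usage sketch (not part of the original module): `accuracy` returns one
# percentage per requested k, each as a 1-element tensor.
#
#   logits = torch.randn(32, 1000)            # (batch, num_classes)
#   labels = torch.randint(0, 1000, (32,))    # ground-truth class indices
#   top1, top5 = accuracy(logits, labels, topk=(1, 5))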
def accuracy_multi(output, target):
    """Element-wise accuracy (in percent) for multi-label outputs, thresholding logits at 0."""
    pred = (output > 0).float()
tf = (pred == target).float()
acc = tf.sum() / output.size(0) / output.size(1) * 100
return acc
def save_state(state, path, step):
path, filename = os.path.split(path)
assert path != ''
if not os.path.exists(path):
os.makedirs(path, exist_ok=True)
print('saving to {}/{}_iter_{}.pth.tar'.format(path, filename, step))
try:
torch.save(state, '{}/{}_iter_{}.pth.tar'.format(path, filename, step))
except TypeError as e:
print(f"Full key list: state['state_dict'].keys(): {state['state_dict'].keys()}")
raise e
def load_last_iter(path):
if os.path.isfile(path):
checkpoint = torch.load(path, map_location='cpu')
dist.barrier()
print("=> loaded last_iter={} from {}".format(checkpoint['step'], path))
dist.barrier()
return checkpoint['step']
else:
raise RuntimeError("=> no checkpoint found at {}".format(path))
def remove_prefix_string(string, prefix):
assert string.startswith(prefix), "can not remove prefix."
return string[len(prefix):]
def remove_prefix_from_state_dict(state_dict, prefix):
for old_key in list(state_dict.keys()):
if old_key.startswith(prefix):
new_key = remove_prefix_string(old_key, prefix)
state_dict[new_key] = state_dict.pop(old_key)
def load_state(path, model, ignore=[], optimizer=None, cuda=False, recover=False,
remove_prefix=None, strict=False):
def map_func_cuda(storage, location):
return storage.cuda()
def map_func_cpu(storage, location):
return storage.cpu()
if cuda:
map_func = map_func_cuda
else:
map_func = map_func_cpu
if os.path.isfile(path):
print("=> loading checkpoint '{}'".format(path))
checkpoint = torch.load(path, map_location=map_func)
if 'state_dict' in checkpoint.keys():
pretrained_state_dict_new = checkpoint['state_dict']
else:
pretrained_state_dict_new = checkpoint
pretrained_state_dict = dict()
for k in list(pretrained_state_dict_new.keys()):
if '_orig_mod.' in k:
k_new = k.split('_orig_mod.')[1]
pretrained_state_dict[k_new] = pretrained_state_dict_new[k]
else:
pretrained_state_dict[k] = pretrained_state_dict_new[k]
if len(ignore) > 0:
            assert optimizer is None
for k in list(pretrained_state_dict.keys()):
flag = False
for prefix in ignore:
if k.startswith(prefix):
flag = True
the_prefix = prefix
break
if flag:
print('ignoring {} (prefix: {})'.format(k, the_prefix))
del pretrained_state_dict[k]
if remove_prefix:
remove_prefix_from_state_dict(pretrained_state_dict, remove_prefix)
model.load_state_dict(pretrained_state_dict, strict=strict)
dist.barrier()
if dist.get_rank() == 0:
keys1 = set(pretrained_state_dict.keys())
keys2 = set([k for k,_ in model.named_parameters()])
not_loaded = keys2 - keys1
for k in not_loaded:
print('caution: {} not loaded'.format(k))
dist.barrier()
        if optimizer is not None:
assert len(ignore) == 0
#TODO currently a workaround for gpu memory leak
optimizer.load_state_dict(checkpoint['optimizer'])
for state in optimizer.state.values():
for k, v in state.items():
if isinstance(v, torch.Tensor):
state[k] = v.cuda()
else:
state[k] = v
print("k: {} do not move to cuda".format(k))
print("=> loaded checkpoint '{}' (step {})".format(path, checkpoint['step']))
return checkpoint['step']
if recover:
return checkpoint['step']
else:
assert False, "=> no checkpoint found at '{}'".format(path)
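# Illustrative usage sketch (not part of the original module; paths and prefixes are
# hypothetical): `ignore` drops checkpoint entries by key prefix, `remove_prefix` strips a
# wrapper prefix such as 'module.' before loading, and passing an optimizer also restores
# its state and returns the saved step for resuming.
#
#   load_state('checkpoints/model_iter_10000.pth.tar', model,
#              ignore=['module.logits'], remove_prefix='module.', strict=False)
#   last_step = load_state('checkpoints/model_iter_10000.pth.tar', model,
#                          optimizer=optimizer, recover=True)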
def load_state_model(model, state, ginfo):
if ginfo.task_rank == 0:
printlog(f'======= loading model state for task {ginfo.task_id} ... =======')
msg = model.load_state_dict(state, strict=False)
state_keys = set(state.keys())
model_keys = set(model.state_dict().keys())
missing_keys = model_keys - state_keys
if ginfo.task_rank == 0:
for k in missing_keys:
printlog(f'missing key: {k}')
printlog(f'load msg: {msg}')
def load_state_optimizer(optimizer, state, ginfo):
if ginfo.task_rank == 0:
printlog(f'======= loading optimizer state for task {ginfo.task_id} ... =======')
optimizer.load_state_dict(state)
def create_logger(name, log_file, level=logging.INFO):
l = logging.getLogger(name)
formatter = logging.Formatter('[%(asctime)s][%(filename)20s][line:%(lineno)4d][%(levelname)8s] %(message)s')
fh = logging.FileHandler(log_file)
fh.setFormatter(formatter)
sh = logging.StreamHandler()
sh.setFormatter(formatter)
l.setLevel(level)
l.addHandler(fh)
l.addHandler(sh)
return l
class IterLRScheduler(object):
def __init__(self, optimizer, milestones, lr_mults, last_iter=-1):
assert len(milestones) == len(lr_mults), "{} vs {}".format(len(milestones), len(lr_mults))
self.milestones = milestones
self.lr_mults = lr_mults
if not isinstance(optimizer, torch.optim.Optimizer) and not isinstance(optimizer, fp16.FP16_Optimizer):
raise TypeError('{} is not an Optimizer'.format(
type(optimizer).__name__))
self.optimizer = optimizer
for i, group in enumerate(optimizer.param_groups):
if 'lr' not in group:
raise KeyError("param 'lr' is not specified "
"in param_groups[{}] when resuming an optimizer".format(i))
self.last_iter = last_iter
    def _get_lr(self):
        try:
            pos = self.milestones.index(self.last_iter)
        except ValueError:
            # current iteration is not a milestone: keep the existing learning rates
            return list(map(lambda group: group['lr'], self.optimizer.param_groups))
        return list(map(lambda group: group['lr'] * self.lr_mults[pos], self.optimizer.param_groups))
def get_lr(self):
return list(map(lambda group: group['lr'], self.optimizer.param_groups))
def step(self, this_iter=None):
if this_iter is None:
this_iter = self.last_iter + 1
self.last_iter = this_iter
for param_group, lr in zip(self.optimizer.param_groups, self._get_lr()):
param_group['lr'] = lr
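# Illustrative usage sketch (not part of the original module; milestone values are
# hypothetical): whenever the current iteration matches a milestone, the learning rate of
# every param group is multiplied by the corresponding factor, so milestones=[60000, 80000]
# with lr_mults=[0.1, 0.1] gives two 10x decays.
#
#   scheduler = IterLRScheduler(optimizer, milestones=[60000, 80000], lr_mults=[0.1, 0.1])
#   for it in range(max_iter):
#       ...                     # forward/backward/optimizer.step()
#       scheduler.step(it)      # or scheduler.step() to advance by one iteration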
def reset_bn(module):
if isinstance(module, BatchNorm2d) or isinstance(module, torch.nn.SyncBatchNorm):
module.running_mean = torch.zeros_like(module.running_mean)
module.running_var = torch.ones_like(module.running_var)
def pil_loader(img_str):
buff = io.BytesIO(img_str)
with Image.open(buff) as img:
img = img.convert('RGB')
return img
def cv2_loader(img_str):
img_array = np.frombuffer(img_str, dtype=np.uint8)
return cv2.imdecode(img_array, cv2.IMREAD_COLOR)
def param_groups(model):
bn_group = []
fc_group = []
feature_group = []
normal_group = []
bn_names = set()
for name,m in model.named_modules():
if isinstance(m, BatchNorm2d) or isinstance(m, torch.nn.SyncBatchNorm):
if not m.weight is None:
bn_group.append(m.weight)
bn_names.add(name+'.weight')
if not m.bias is None:
bn_group.append(m.bias)
bn_names.add(name+'.bias')
for name,param in model.named_parameters():
if name in bn_names:
continue
elif name.startswith('module.base.fc'):
feature_group.append(param)
elif name.startswith('module.logits'):
fc_group.append(param)
else:
normal_group.append(param)
return bn_group, feature_group, fc_group, normal_group
def clip_grad_value(parameters, clip_value):
clip_value = float(clip_value)
for p in filter(lambda p: p.grad is not None, parameters):
p.grad.data.clamp_(min=-clip_value, max=clip_value)
def compute_grad_norm(parameters):
parameters = list(filter(lambda p: p.grad is not None, parameters))
total_norm = 0
for p in parameters:
param_norm = p.grad.data.norm(2)
total_norm += param_norm ** 2
total_norm = total_norm ** 0.5
return total_norm
class SIMSELoss(nn.Module):
def __init__(self):
super(SIMSELoss, self).__init__()
def forward(self, pred, real):
diffs = real - pred
n = torch.numel(diffs.data)
mse = torch.sum(diffs.pow(2)) / n
simse = torch.sum(diffs).pow(2) / (n ** 2)
return mse - simse
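# Note (added for clarity, not in the original): with d = real - pred and n = d.numel(),
# this loss equals mean(d**2) - (d.sum() / n)**2, i.e. the MSE minus the squared mean error,
# which is the (biased) variance of the error, matching the class name SIMSE
# (scale-invariant MSE).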
class GradRejust(torch.autograd.Function):
@staticmethod
def forward(ctx, x, grad_scale):
ctx.grad_scale = grad_scale
return x.view_as(x)
@staticmethod
def backward(ctx, grad_output):
return ctx.grad_scale * grad_output, None
def grad_rejust(x, grad_scale=1.0):
return GradRejust.apply(x, grad_scale)
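# Illustrative usage sketch (not part of the original module; `backbone` and `aux_head` are
# hypothetical): grad_rejust is the identity in the forward pass and multiplies the incoming
# gradient by `grad_scale` in the backward pass, e.g. to down-weight the gradient an
# auxiliary branch sends into a shared feature.
#
#   feat = backbone(x)
#   aux_out = aux_head(grad_rejust(feat, grad_scale=0.1))  # aux loss contributes 10% of its gradient to feat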
def count_parameters_num(model):
count = 0
count_fc = 0
param_dict = {name:param for name,param in model.named_parameters()}
param_keys = param_dict.keys()
for m_name, m in model.named_modules():
if isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, torch.nn.SyncBatchNorm):
weight_name = m_name + '.weight'
bias_name = m_name + '.bias'
if weight_name in param_keys:
temp_params = param_dict[weight_name]
count += temp_params.data.nelement()
if bias_name in param_keys:
temp_params = param_dict[bias_name]
count += temp_params.data.nelement()
elif isinstance(m, nn.Linear):
weight_name = m_name + '.weight'
bias_name = m_name + '.bias'
if weight_name in param_keys:
temp_params = param_dict[weight_name]
count_fc += temp_params.data.nelement()
if bias_name in param_keys:
temp_params = param_dict[bias_name]
count_fc += temp_params.data.nelement()
sync_print('Number of conv/bn params: %.2fM' % (count / 1e6))
sync_print('Number of linear params: %.2fM' % (count_fc / 1e6))
def get_gpu_memory_map():
"""Get the current gpu usage.
Returns
-------
usage: dict
Keys are device ids as integers.
Values are memory usage as integers in MB.
"""
result = subprocess.check_output(
[
'nvidia-smi', '--query-gpu=memory.used',
'--format=csv,nounits,noheader'
], encoding='utf-8')
# Convert lines into a dictionary
gpu_memory = [int(x) for x in result.strip().split('\n')]
gpu_memory_map = dict(zip(range(len(gpu_memory)), gpu_memory))
return gpu_memory_map
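# Illustrative usage sketch (not part of the original module): requires `nvidia-smi` on PATH;
# the example output values are made up.
#
#   get_gpu_memory_map()   # e.g. {0: 10240, 1: 512}  -- MiB currently used per visible GPU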
def param_group_no_wd(model):
pgroup_no_wd = []
names_no_wd = []
pgroup_normal = []
type2num = defaultdict(lambda : 0)
for name,m in model.named_modules():
if isinstance(m, torch.nn.Conv2d):
if m.bias is not None:
pgroup_no_wd.append(m.bias)
names_no_wd.append(name+'.bias')
type2num[m.__class__.__name__+'.bias'] += 1
elif isinstance(m, torch.nn.Linear):
if m.bias is not None:
pgroup_no_wd.append(m.bias)
names_no_wd.append(name+'.bias')
type2num[m.__class__.__name__+'.bias'] += 1
elif isinstance(m, torch.nn.BatchNorm2d) or isinstance(m, torch.nn.BatchNorm1d) or isinstance(m, torch.nn.SyncBatchNorm):
if m.weight is not None:
pgroup_no_wd.append(m.weight)
names_no_wd.append(name+'.weight')
type2num[m.__class__.__name__+'.weight'] += 1
if m.bias is not None:
pgroup_no_wd.append(m.bias)
names_no_wd.append(name+'.bias')
type2num[m.__class__.__name__+'.bias'] += 1
for name,p in model.named_parameters():
if not name in names_no_wd:
pgroup_normal.append(p)
return [{'params': pgroup_normal}, {'params': pgroup_no_wd, 'weight_decay': 0.0}], type2num
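# Illustrative usage sketch (not part of the original module): biases and norm-layer
# parameters go into a group with weight_decay=0.0, everything else keeps the
# optimizer-level weight decay.
#
#   groups, type2num = param_group_no_wd(model)
#   optimizer = torch.optim.SGD(groups, lr=0.1, momentum=0.9, weight_decay=1e-4)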
def freeze_bn(model):
names = []
for name, m in model.named_modules():
if isinstance(m, torch.nn.BatchNorm2d) or isinstance(m, torch.nn.SyncBatchNorm):
m.eval()
names.append(name)
return names
def named_buffers(self, memo=None, prefix=''):
if memo is None:
memo = set()
for name, b in self._buffers.items():
if b is not None and b not in memo:
memo.add(b)
yield prefix + ('.' if prefix else '') + name, b
for mname, module in self.named_children():
submodule_prefix = prefix + ('.' if prefix else '') + mname
for name, b in module.named_buffers(memo, submodule_prefix):
yield name, b
def change_tensor_half():
sync_print('override tensor.half() to preserve task_specific flag')
# change .half of Tensor
ori_tensor_half = torch.Tensor.half
torch.Tensor.ori_half = ori_tensor_half
def new_half(self, *args, **kwargs):
half_t = self.ori_half(*args, **kwargs)
if hasattr(self, 'task_specific'):
print('preserving task_specific in .half')
half_t.task_specific = self.task_specific
if hasattr(self, 'modality_share'):
print('preserving modality_share in .half')
half_t.modality_share = self.modality_share
if hasattr(self, 'backbone_specific'):
print('preserving backbone_specific in .half')
half_t.backbone_specific = self.backbone_specific
if hasattr(self, 'adapter_specific'):
print('preserving adapter_specific in .half')
half_t.adapter_specific = self.adapter_specific
if hasattr(self, 'neck_specific'):
print('preserving neck_specific in .half')
half_t.neck_specific = self.neck_specific
if hasattr(self, 'decoder_specific'):
print('preserving decoder_specific in .half')
half_t.decoder_specific = self.decoder_specific
if hasattr(self, 'rgb_specific'):
print('preserving rgb_specific in .half')
half_t.rgb_specific = self.rgb_specific
        # note: renamed from '*_label_specific' to the '*_labeling_specific' flags that the
        # add_aiov2_* helpers below actually set and that change_tensor_cuda() preserves;
        # the old names were never assigned anywhere in this file
        if hasattr(self, 'dense_labeling_specific'):
            print('preserving dense_labeling_specific in .half')
            half_t.dense_labeling_specific = self.dense_labeling_specific
        if hasattr(self, 'sparse_labeling_specific'):
            print('preserving sparse_labeling_specific in .half')
            half_t.sparse_labeling_specific = self.sparse_labeling_specific
if hasattr(self, 'text_specific'):
print('preserving text_specific in .half')
half_t.text_specific = self.text_specific
if hasattr(self, 'video_specific'):
print('preserving video_specific in .half')
half_t.video_specific = self.video_specific
return half_t
torch.Tensor.half = new_half
def change_tensor_cuda():
sync_print('override tensor.cuda() to preserve task_specific flag')
# change .cuda of Tensor
ori_tensor_cuda = torch.Tensor.cuda
torch.Tensor.ori_cuda = ori_tensor_cuda
def new_cuda(self, *args, **kwargs): # could be written as decorator I guess...
cuda_t = self.ori_cuda(*args, **kwargs)
if hasattr(self, 'task_specific'):
cuda_t.task_specific = self.task_specific
if hasattr(self, 'modality_share'):
cuda_t.modality_share = self.modality_share
if hasattr(self, 'backbone_specific'):
cuda_t.backbone_specific = self.backbone_specific
if hasattr(self, 'adapter_specific'):
cuda_t.adapter_specific = self.adapter_specific
if hasattr(self, 'neck_specific'):
cuda_t.neck_specific = self.neck_specific
if hasattr(self, 'decoder_specific'):
cuda_t.decoder_specific = self.decoder_specific
if hasattr(self, 'rgb_specific'):
cuda_t.rgb_specific = self.rgb_specific
if hasattr(self, 'dense_labeling_specific'):
cuda_t.dense_labeling_specific = self.dense_labeling_specific
if hasattr(self, 'sparse_labeling_specific'):
cuda_t.sparse_labeling_specific = self.sparse_labeling_specific
if hasattr(self, 'text_specific'):
cuda_t.text_specific = self.text_specific
if hasattr(self, 'video_specific'):
cuda_t.video_specific = self.video_specific
return cuda_t
torch.Tensor.cuda = new_cuda
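# Illustrative usage sketch (not part of the original module; needs a CUDA device): after
# calling change_tensor_cuda() (and/or change_tensor_half()), the sharing flags that the
# add_*_specific helpers below attach to parameters and buffers survive .cuda()/.half(), so
# a distributed wrapper can still distinguish task-specific tensors from shared ones.
#
#   change_tensor_cuda()
#   t = torch.zeros(4)
#   t.task_specific = True
#   t.cuda().task_specific   # True with the override; the plain .cuda() would drop the flag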
def add_task_specific(m, task_specific):
for name, param in m.named_parameters():
param.task_specific = task_specific
param.backbone_specific = False
param.neck_specific = False
param.decoder_specific = False
if task_specific:
printlog('add param {} as task_specific'.format(name))
if not hasattr(torch.nn.Module, 'named_buffers'):
printlog('registering named_buffers for nn.Module at add_task_specific')
torch.nn.Module.named_buffers = named_buffers
    #m.cuda() # necessary for broadcast in DistModule,
    # since buffers are tensors which will be changed after .cuda()
for name, buffer in m.named_buffers():
buffer.task_specific = task_specific
buffer.backbone_specific = False
buffer.neck_specific = False
buffer.decoder_specific = False
if task_specific:
printlog('add buffer {} as task_specific'.format(name))
def add_backbone_specific(m, backbone_specific):
for name, param in m.named_parameters():
param.task_specific = False
param.backbone_specific = backbone_specific
param.neck_specific = False
param.decoder_specific = False
if backbone_specific:
printlog('add param {} as backbone_specific'.format(name))
if not hasattr(torch.nn.Module, 'named_buffers'):
printlog('registering named_buffers for nn.Module at add_backbone_specific')
torch.nn.Module.named_buffers = named_buffers
    #m.cuda() # necessary for broadcast in DistModule, since buffers are tensors which will be changed after .cuda()
for name, buffer in m.named_buffers():
buffer.task_specific = False
buffer.backbone_specific = backbone_specific
buffer.neck_specific = False
buffer.decoder_specific = False
if backbone_specific:
printlog('add buffer {} as backbone_specific'.format(name))
def add_neck_specific(m, neck_specific):
for name, param in m.named_parameters():
param.task_specific = False
param.backbone_specific = False
param.neck_specific = neck_specific
param.decoder_specific = False
if neck_specific:
printlog('add param {} as neck_specific'.format(name))
if not hasattr(torch.nn.Module, 'named_buffers'):
printlog('registering named_buffers for nn.Module at add_neck_specific')
torch.nn.Module.named_buffers = named_buffers
    #m.cuda() # necessary for broadcast in DistModule, since buffers are tensors which will be changed after .cuda()
for name, buffer in m.named_buffers():
buffer.task_specific = False
buffer.backbone_specific = False
buffer.neck_specific = neck_specific
buffer.decoder_specific = False
if neck_specific:
printlog('add buffer {} as neck_specific'.format(name))
def add_decoder_specific(m, decoder_specific):
for name, param in m.named_parameters():
param.task_specific = False
param.backbone_specific = False
param.neck_specific = False
param.decoder_specific = decoder_specific
if decoder_specific:
printlog('add param {} as decoder_specific'.format(name))
if not hasattr(torch.nn.Module, 'named_buffers'):
printlog('registering named_buffers for nn.Module at add_decoder_specific')
torch.nn.Module.named_buffers = named_buffers
    #m.cuda() # necessary for broadcast in DistModule, since buffers are tensors which will be changed after .cuda()
for name, buffer in m.named_buffers():
buffer.task_specific = False
buffer.backbone_specific = False
buffer.neck_specific = False
buffer.decoder_specific = decoder_specific
if decoder_specific:
printlog('add buffer {} as decoder_specific'.format(name))
def add_aiov2_decoder_specific(m, decoder_specific, task_sp_list=(), neck_sp_list=(), modality_share_list=()):
for name, param in m.named_parameters():
_task_sp_flag = any(name.startswith(sp_name) or name.endswith(sp_name) for sp_name in task_sp_list)
_neck_sp_flag = any(name.startswith(sp_name) or name.endswith(sp_name) for sp_name in neck_sp_list)
_modality_share_flag = any(name.startswith(share_name) or name.endswith(share_name) for share_name in modality_share_list)
param.task_specific = _task_sp_flag
param.modality_share = False if _task_sp_flag or _neck_sp_flag else _modality_share_flag
param.backbone_specific = False
param.rgb_specific = False
param.dense_labeling_specific = False
param.text_specific = False
param.video_specific = False
param.sparse_labeling_specific = False
param.decoder_specific = False if _task_sp_flag or _neck_sp_flag or _modality_share_flag else decoder_specific
if _task_sp_flag:
printlog('add param {} as task_specific'.format(name))
elif _neck_sp_flag:
printlog('add param {} as neck_specific'.format(name))
elif _modality_share_flag:
printlog('add param {} as modality_share'.format(name))
elif decoder_specific:
printlog('add param {} as decoder_specific'.format(name))
if not hasattr(torch.nn.Module, 'named_buffers'):
printlog('registering named_buffers for nn.Module at add_decoder_specific')
torch.nn.Module.named_buffers = named_buffers
    #m.cuda() # necessary for broadcast in DistModule, since buffers are tensors which will be changed after .cuda()
for name, buffer in m.named_buffers():
_task_sp_flag = any(name.startswith(sp_name) or name.endswith(sp_name) for sp_name in task_sp_list)
_neck_sp_flag = any(name.startswith(sp_name) or name.endswith(sp_name) for sp_name in neck_sp_list)
_modality_share_flag = any(name.startswith(share_name) or name.endswith(share_name) for share_name in modality_share_list)
buffer.task_specific = _task_sp_flag
buffer.modality_share = False if _task_sp_flag or _neck_sp_flag else _modality_share_flag
buffer.backbone_specific = False
buffer.rgb_specific = False
buffer.dense_labeling_specific = False
buffer.text_specific = False
buffer.video_specific = False
buffer.sparse_labeling_specific = False
buffer.decoder_specific = False if _task_sp_flag or _neck_sp_flag or _modality_share_flag else decoder_specific
if _task_sp_flag:
printlog('add buffer {} as task_specific'.format(name))
elif _neck_sp_flag:
printlog('add buffer {} as neck_specific'.format(name))
elif _modality_share_flag:
printlog('add buffer {} as modality_share'.format(name))
elif decoder_specific:
printlog('add buffer {} as decoder_specific'.format(name))
def add_aiov2_backbone_specific(m, backbone_specific, task_sp_list=(), neck_sp_list=(), modality_share_list=()):
for name, param in m.named_parameters():
_task_sp_flag = any(name.startswith(sp_name) or name.endswith(sp_name) for sp_name in task_sp_list)
_neck_sp_flag = any(name.startswith(sp_name) or name.endswith(sp_name) for sp_name in neck_sp_list)
param.task_specific = _task_sp_flag
param.modality_share = False
param.backbone_specific = False if _task_sp_flag or _neck_sp_flag else backbone_specific
param.rgb_specific = False
param.dense_labeling_specific = False
param.text_specific = False
param.video_specific = False
param.sparse_labeling_specific = False
param.decoder_specific = False
if _task_sp_flag:
printlog('add param {} as task_specific'.format(name))
elif _neck_sp_flag:
printlog('add param {} as neck_specific'.format(name))
elif backbone_specific:
printlog('add param {} as backbone_specific'.format(name))
if not hasattr(torch.nn.Module, 'named_buffers'):
printlog('registering named_buffers for nn.Module at add_backbone_specific')
torch.nn.Module.named_buffers = named_buffers
    #m.cuda() # necessary for broadcast in DistModule, since buffers are tensors which will be changed after .cuda()
for name, buffer in m.named_buffers():
_task_sp_flag = any(name.startswith(sp_name) or name.endswith(sp_name) for sp_name in task_sp_list)
_neck_sp_flag = any(name.startswith(sp_name) or name.endswith(sp_name) for sp_name in neck_sp_list)
buffer.task_specific = _task_sp_flag
buffer.modality_share = False
buffer.backbone_specific = False if _task_sp_flag or _neck_sp_flag else backbone_specific
buffer.rgb_specific = False
buffer.dense_labeling_specific = False
buffer.text_specific = False
buffer.video_specific = False
buffer.sparse_labeling_specific = False
buffer.decoder_specific = False
if _task_sp_flag:
printlog('add buffer {} as task_specific'.format(name))
elif _neck_sp_flag:
printlog('add buffer {} as neck_specific'.format(name))
elif backbone_specific:
printlog('add buffer {} as backbone_specific'.format(name))
def add_aiov2_task_specific(m, task_specific=True):
for name, param in m.named_parameters():
param.task_specific = task_specific
param.modality_share = False
param.backbone_specific = False
param.rgb_specific = False
param.dense_labeling_specific = False
param.text_specific = False
param.video_specific = False
param.sparse_labeling_specific = False
param.decoder_specific = False
printlog('add param {} as task_specific'.format(name))
if not hasattr(torch.nn.Module, 'named_buffers'):
printlog('registering named_buffers for nn.Module at add_task_specific')
torch.nn.Module.named_buffers = named_buffers
    #m.cuda() # necessary for broadcast in DistModule, since buffers are tensors which will be changed after .cuda()
for name, buffer in m.named_buffers():
buffer.task_specific = task_specific
buffer.modality_share = False
buffer.backbone_specific = False
buffer.rgb_specific = False
buffer.dense_labeling_specific = False
buffer.text_specific = False
buffer.video_specific = False
buffer.sparse_labeling_specific = False
buffer.decoder_specific = False
        printlog('add buffer {} as task_specific'.format(name))
def param_specific_setting_with_modality(param, modality, _task_sp_flag, _modality_share_flag, modality_specific):
param.rgb_specific = False
param.dense_labeling_specific = False
param.text_specific = False
param.video_specific = False
param.sparse_labeling_specific = False
if modality == 'rgb':
param.rgb_specific = False if _task_sp_flag or _modality_share_flag else modality_specific
elif modality == 'dense_labeling':
param.dense_labeling_specific = False if _task_sp_flag or _modality_share_flag else modality_specific
elif modality == 'sparse_labeling':
param.sparse_labeling_specific = False if _task_sp_flag or _modality_share_flag else modality_specific
elif modality == 'video':
param.video_specific = False if _task_sp_flag or _modality_share_flag else modality_specific
elif modality == 'text':
param.text_specific = False if _task_sp_flag or _modality_share_flag else modality_specific
return param
def add_aiov2_modality_specific(m, modality, modality_specific, task_sp_list=(), modality_share_list=()):
for name, param in m.named_parameters():
_task_sp_flag = any(name.startswith(sp_name) or name.endswith(sp_name) for sp_name in task_sp_list)
_modality_share_flag = any(name.startswith(share_name) or name.endswith(share_name) for share_name in modality_share_list)
param.task_specific = _task_sp_flag
param.modality_share = False if _task_sp_flag else _modality_share_flag
param.backbone_specific = False
param.decoder_specific = False
        param = param_specific_setting_with_modality(param, modality, _task_sp_flag, _modality_share_flag, modality_specific)
if _task_sp_flag:
printlog('add param {} as task_specific'.format(name))
elif _modality_share_flag:
printlog('add param {} as modality_share'.format(name))
elif modality_specific:
printlog('add param {} as {}_specific'.format(name, modality))
if not hasattr(torch.nn.Module, 'named_buffers'):
printlog('registering named_buffers for nn.Module at add_adapter_specific')
torch.nn.Module.named_buffers = named_buffers
    #m.cuda() # necessary for broadcast in DistModule, since buffers are tensors which will be changed after .cuda()
for name, buffer in m.named_buffers():
_task_sp_flag = any(name.startswith(sp_name) or name.endswith(sp_name) for sp_name in task_sp_list)
_modality_share_flag = any(name.startswith(share_name) or name.endswith(share_name) for share_name in modality_share_list)
buffer.task_specific = _task_sp_flag
buffer.modality_share = False if _task_sp_flag else _modality_share_flag
buffer.backbone_specific = False
buffer.decoder_specific = False
buffer = param_specific_setting_with_modality(buffer, modality, _task_sp_flag, _modality_share_flag, modality_specific)
if _task_sp_flag:
printlog('add buffer {} as task_specific'.format(name))
elif _modality_share_flag:
printlog('add buffer {} as modality_share'.format(name))
elif modality_specific:
printlog('add buffer {} as {}_specific'.format(name, modality))
def copy_state_dict_cpu(state_dict):
new_state = {}
for k,v in state_dict.items():
new_state[k] = v.cpu()
return new_state
def copy_optim_state_dict_cpu(state_dict):
new_state = {}
new_state['param_groups'] = copy.deepcopy(state_dict['param_groups'])
new_state['state'] = {}
for k,v in state_dict['state'].items():
new_state['state'][k] = {}
for name,x in v.items():
if isinstance(x, torch.Tensor):
new_state['state'][k][name] = x.cpu()
else:
new_state['state'][k][name] = copy.deepcopy(x)
return new_state
def copy_optim_state_dict_cpu_fp16(state_dict):
new_state = {}
new_state['optimizer_state_dict'] = copy_optim_state_dict_cpu(state_dict['optimizer_state_dict'])
for k in state_dict.keys():
if k != 'optimizer_state_dict':
new_state[k] = copy.deepcopy(state_dict[k])
return new_state
def sync_print(*args, **kwargs):
    if not dist.is_initialized():
        print(*args, **kwargs)
    else:
        rank = dist.get_rank()
        # cast args to str so non-string arguments do not break the join
        print('sync_print: rank {}, '.format(rank) + ' '.join(str(a) for a in args), **kwargs)
def fully_checkpoint_sequential(functions, segments, input, **kwargs):
r"""Modified version of torch.utils.checkpoint.checkpoint_sequential for memory efficiency.
    It is assumed that at least one of the inputs has requires_grad=True, so all of the
    segments can be checkpointed safely.
Please refer to https://pytorch.org/docs/stable/checkpoint.html#torch.utils.checkpoint.checkpoint_sequential
for more details.
    -1     -> sqrt chunk checkpoint (roughly sqrt(len(functions)) chunks)
     0     -> no checkpoint
    others -> checkpoint in `segments` chunks
"""
preserve = kwargs.pop('preserve_rng_state', True)
if kwargs:
raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwargs))
def run_function(start, end, functions):
def forward(input):
for j in range(start, end + 1):
input = functions[j](input)
return input
return forward
if isinstance(functions, torch.nn.Sequential):
functions = list(functions.children())
# no checkpoint
if segments == 0:
return run_function(0, len(functions) - 1, functions)(input)
    # automatically determine the chunk count (sqrt of the number of functions, per the docstring above)
    if segments < 0:
        segments = int(math.ceil(math.sqrt(len(functions))))
segments = min(segments, len(functions))
segment_size = len(functions) // segments
# the last chunk has to be non-volatile
end = -1
for start in range(0, segment_size * (segments - 1), segment_size):
end = start + segment_size - 1
input = checkpoint(run_function(start, end, functions), input)
# preserve_rng_state=preserve)
return checkpoint(run_function(end + 1, len(functions) - 1, functions), input)#,
# preserve_rng_state=preserve)
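# Illustrative usage sketch (not part of the original module; `make_block` is hypothetical):
# checkpoint a stack of blocks in `segments` chunks; segments=0 runs them without
# checkpointing, and the input should require grad so recomputation yields gradients.
#
#   blocks = torch.nn.Sequential(*[make_block() for _ in range(24)])
#   x = torch.randn(2, 197, 768, requires_grad=True)
#   out = fully_checkpoint_sequential(blocks, 4, x)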
def printlog(*args, **kwargs):
if not dist.is_initialized():
print(*args, **kwargs)
else:
print(f"[rank {dist.get_rank()}]", *args, **kwargs)
def _max_by_axis(the_list):
# type: (List[List[int]]) -> List[int]
maxes = the_list[0]
for sublist in the_list[1:]:
for index, item in enumerate(sublist):
maxes[index] = max(maxes[index], item)
return maxes
class NestedTensor(object):
def __init__(self, tensors, mask: Optional[Tensor]):
self.tensors = tensors
self.mask = mask
def to(self, device):
# type: (Device) -> NestedTensor # noqa
cast_tensor = self.tensors.to(device)
mask = self.mask
if mask is not None:
assert mask is not None
cast_mask = mask.to(device)
else:
cast_mask = None
return NestedTensor(cast_tensor, cast_mask)
def decompose(self):
return self.tensors, self.mask
def cuda(self):
return self.to('cuda')
def __repr__(self):
return str(self.tensors)
def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
# TODO make this more general
if tensor_list[0].ndim == 3:
# TODO make it support different-sized images
max_size = _max_by_axis([list(img.shape) for img in tensor_list])
# import pdb;pdb.set_trace()
## for nested tensor debug
# max_size = [3, 816, 816]
# min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
batch_shape = [len(tensor_list)] + max_size
b, c, h, w = batch_shape
dtype = tensor_list[0].dtype
device = tensor_list[0].device
tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
for img, pad_img, m in zip(tensor_list, tensor, mask):
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
m[: img.shape[1], :img.shape[2]] = False # 0: content, 1: pad
else:
raise ValueError('not supported')
return NestedTensor(tensor, mask)
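# Illustrative usage sketch (not part of the original module): images of different spatial
# sizes are zero-padded to the per-batch maximum and a boolean mask marks the padding
# (False = real content, True = padding).
#
#   imgs = [torch.rand(3, 480, 640), torch.rand(3, 512, 512)]
#   batch = nested_tensor_from_tensor_list(imgs)
#   batch.tensors.shape   # torch.Size([2, 3, 512, 640])
#   batch.mask.shape      # torch.Size([2, 512, 640])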
def get_num_layer_for_vit(var_name, config):
if (var_name == "module.backbone_module" or var_name.endswith("prompt_embed_kv")) and config.get('lpe_lr', False): # for PE.
return config.num_layers - 1
if var_name in ("module.backbone_module", "module.backbone_module.cls_token", "module.backbone_module.mask_token"):
return 0
elif var_name.startswith("module.backbone_module.patch_embed"):
return 0
elif var_name.startswith("module.backbone_module") and not (var_name.startswith("module.backbone_module.norm") or
var_name.startswith("module.backbone_module.ln_pre")):
layer_id = int(var_name.split('.')[3])
return layer_id + 1
else:
return config.num_layers - 1
def get_num_layer_for_vit_with_adapter(var_name, var_param_name, config):
if (var_name.startswith("module.adapter_") and var_param_name=='pos_embed' and len(var_name.split('.'))==2) and config.get('lpe_lr', False): # for PE.
return config.num_layers - 1
# if var_name in ("module.backbone_module", "module.backbone_module.cls_token", "module.backbone_module.mask_token"):
# return 0
    elif var_name.startswith("module.adapter_"):
return 0
# elif var_name.startswith("module.backbone_module") and not (var_name.startswith("module.backbone_module.norm") or
# var_name.startswith("module.backbone_module.ln_pre")):
# layer_id = int(var_name.split('.')[3])
# return layer_id + 1
# else:
# return config.num_layers - 1
def nested_tensor_from_tensor_list_fix_shape(tensor_list: List[Tensor], max=1333, short=800, idx=None):
# TODO make this more general
if tensor_list[0].ndim == 3:
# TODO make it support different-sized images
# # for coco, resize to 1333, 800
_, _h, _w = tensor_list[0].shape
if _w > _h:
max_size = [3, short, max]
else:
max_size = [3, max, short]
# min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
batch_shape = [len(tensor_list)] + max_size
b, c, h, w = batch_shape
dtype = tensor_list[0].dtype
device = tensor_list[0].device
tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
for img, pad_img, m in zip(tensor_list, tensor, mask):
pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
m[: img.shape[1], :img.shape[2]] = False
else:
raise ValueError('not supported')
return NestedTensor(tensor, mask)