# core/distributed_utils.py
import os
import torch
import collections
from easydict import EasyDict as edict
import torch.distributed as dist
from torch.nn import Module
from torch.utils.data.sampler import Sampler
import math
import numpy as np
import multiprocessing as mp
import copy
import random
from collections import defaultdict
from core.utils import named_buffers, sync_print, printlog
import subprocess
import socket
from . import comm_
import torch.cuda.comm
class DistModule(torch.nn.Module):
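"""Distributed wrapper for multi-task training (summary of the code below).
On construction it broadcasts parameters and buffers within their sharing groups
(task / backbone / neck / decoder / adapter); reduce_gradients() then all-reduces
each gradient inside the matching group, or globally for fully shared parameters.
Parameters are expected to carry the corresponding *_specific boolean flags.
"""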
def __init__(self, module, sync=False, task_grp=None, share_backbone_group=None, \
share_neck_group=None, share_decoder_group=None, share_adapter_group=None,
ignore_bcast=None, \
task_weight=None, task_size=None):
super(DistModule, self).__init__()
self.module = module
self.sync = sync
self.task_grp = task_grp
self.share_backbone_group = share_backbone_group
self.share_neck_group = share_neck_group
self.share_decoder_group = share_decoder_group
self.share_adapter_group = share_adapter_group
self.task_weight = task_weight
self.task_size = task_size
if not hasattr(torch.nn.Module, 'named_buffers'):
printlog('registering named_buffers for nn.Module at DistModule')
torch.nn.Module.named_buffers = named_buffers
broadcast_params_multitask(self, self.task_grp, self.share_backbone_group,
self.share_neck_group, self.share_decoder_group,
self.share_adapter_group, ignore_bcast)
assert sync, "Currently, only sync model is supported!"
if not sync:
self._grad_accs = {}
self._reduce_hooks = {}
self._register_hooks()
def forward(self, *inputs, **kwargs):
return self.module(*inputs, **kwargs)
def train(self, mode=True):
super(DistModule, self).train(mode)
self.module.train(mode)
def _register_hooks(self):
for i,(name,p) in enumerate(self.named_parameters()):
if p.requires_grad:
p_tmp = p.expand_as(p)
grad_acc = p_tmp.grad_fn.next_functions[0][0]
self._reduce_hooks[name] = grad_acc.register_hook(self._make_hook(name, p, i))
self._grad_accs[name] = grad_acc
def _make_hook(self, name, p, i):
if not p.task_specific:
def hook(*ignore):
allreduce_async(name, p.grad.data)
else:
printlog('{} register hook as task specific'.format(name))
def hook(*ignore):
#link.allreduce_async(name, p.grad.data, group_idx=self.task_grp)
allreduce(p.grad.data, group_idx=self.task_grp)
return hook
def reduce_gradients(self, task_specific=False):
if self.sync:
if not task_specific:
if self.task_grp is not None or self.share_backbone_group is not None \
or self.share_neck_group is not None or self.share_decoder_group is not None:
for name, param in self.named_parameters():
if param.grad is None: param.grad = param.data * 0
if param.task_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.task_grp)
elif param.backbone_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_backbone_group)
elif param.adapter_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_adapter_group)
elif param.neck_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_neck_group)
elif param.decoder_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_decoder_group)
elif param.requires_grad:
allreduce(param.grad.data)
else:
for param in self.parameters():
if param.requires_grad and param.grad is not None:
dist.all_reduce(param.grad.data)
else:
for name, param in self.named_parameters():
if param.requires_grad and param.grad is not None:
allreduce(param.grad.data, group_idx=self.task_grp)
class DistModule_Hulk(torch.nn.Module):
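"""Variant of DistModule used by Hulk: in addition to task/backbone/decoder groups,
parameters can be tied per modality (rgb, dense/sparse labeling, text, video) or across
all modalities via share_modality_group; broadcasting and gradient reduction follow
the same group-wise scheme as DistModule.
"""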
modality_names =['rgb', 'dense_labeling', 'text']
def __init__(self, module, sync=False, task_grp=None, share_backbone_group=None,
share_decoder_group=None, share_rgb_group=None, share_dense_labeling_group=None,
share_sparse_labeling_group=None, share_text_group=None, share_video_group=None,
share_modality_group=None,
ignore_bcast=None, task_weight=None, task_size=None, ):
super(DistModule_Hulk, self).__init__()
self.module = module
self.sync = sync
self.task_grp = task_grp
self.share_modality_group = share_modality_group
self.share_backbone_group = share_backbone_group
# self.share_neck_group = share_neck_group
self.share_decoder_group = share_decoder_group
# self.share_adapter_group = share_adapter_group
# adapter, neck, output proj are communicated
self.share_rgb_group = share_rgb_group
self.share_dense_labeling_group = share_dense_labeling_group
self.share_sparse_labeling_group = share_sparse_labeling_group
self.share_text_group = share_text_group
self.share_video_group = share_video_group
self.task_weight = task_weight
self.task_size = task_size
if not hasattr(torch.nn.Module, 'named_buffers'):
printlog('registering named_buffers for nn.Module at DistModule')
torch.nn.Module.named_buffers = named_buffers
broadcast_params_unihcpv2(self, self.task_grp, self.share_backbone_group,
self.share_decoder_group, self.share_rgb_group,
self.share_dense_labeling_group, self.share_sparse_labeling_group,
self.share_text_group, self.share_video_group,
ignore_bcast,
share_modality_group=self.share_modality_group,
)
assert sync, "Currently, only sync model is supported!"
if not sync:
self._grad_accs = {}
self._reduce_hooks = {}
self._register_hooks()
def forward(self, *inputs, **kwargs):
return self.module(*inputs, **kwargs)
def train(self, mode=True):
super(DistModule_Hulk, self).train(mode)
self.module.train(mode)
def _register_hooks(self):
for i,(name,p) in enumerate(self.named_parameters()):
if p.requires_grad:
p_tmp = p.expand_as(p)
grad_acc = p_tmp.grad_fn.next_functions[0][0]
self._reduce_hooks[name] = grad_acc.register_hook(self._make_hook(name, p, i))
self._grad_accs[name] = grad_acc
def _make_hook(self, name, p, i):
if not p.task_specific:
def hook(*ignore):
allreduce_async(name, p.grad.data)
else:
printlog('{} register hook as task specific'.format(name))
def hook(*ignore):
#link.allreduce_async(name, p.grad.data, group_idx=self.task_grp)
allreduce(p.grad.data, group_idx=self.task_grp)
return hook
def reduce_gradients(self, task_specific=False):
if self.sync:
if not task_specific:
if self.task_grp is not None or self.share_backbone_group is not None \
or self.share_decoder_group is not None or self.share_rgb_group is not None \
or self.share_dense_labeling_group is not None or self.share_text_group is not None \
or self.share_sparse_labeling_group is not None or self.share_video_group is not None \
or self.share_modality_group is not None:
for name, param in self.named_parameters():
if param.grad is None: param.grad = param.data * 0
if param.task_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.task_grp)
elif param.modality_share and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_modality_group)
elif param.backbone_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_backbone_group)
elif param.rgb_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_rgb_group)
elif param.dense_labeling_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_dense_labeling_group)
elif param.sparse_labeling_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_sparse_labeling_group)
elif param.text_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_text_group)
elif param.video_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_video_group)
elif param.decoder_specific and param.requires_grad:
allreduce(param.grad.data, group_idx=self.share_decoder_group)
elif param.requires_grad:
allreduce(param.grad.data)
else:
for param in self.parameters():
if param.requires_grad and param.grad is not None:
dist.all_reduce(param.grad.data)
else:
for name, param in self.named_parameters():
if param.requires_grad and param.grad is not None:
allreduce(param.grad.data, group_idx=self.task_grp)
def allreduce(x, group_idx=None, ):
if group_idx == 0:
group_idx = None
return dist.all_reduce(x, group=group_idx)
def allreduce_async(name, x, group_idx=None, ):
if group_idx == 0:
group_idx = None
return dist.all_reduce(x, group=group_idx)
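# Note on the wrappers above: a group index of 0 is treated as "no group", i.e. the
# default (global) process group. Despite its name, allreduce_async currently performs
# a blocking dist.all_reduce (async_op is not set).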
def broadcast_params(model, task_grp, ignore):
""" broadcast model parameters """
if task_grp is not None:
for name,p in model.named_parameters():
if ignore and name in ignore:
printlog('param {} ignored in broadcast'.format(name))
continue
try:
if not p.task_specific:
broadcast(p, 0)
else:
printlog('broadcasting task-specific param {}'.format(name))
broadcast(p, 0, group_idx=task_grp)
except:
raise RuntimeError('param {} does not have task_specific'.format(name))
for name,b in model.named_buffers():
if ignore and name in ignore:
printlog('buffer {} ignored in broadcast'.format(name))
continue
try:
if not b.task_specific:
broadcast(b, 0)
else:
printlog('broadcasting task-specific buffer {}'.format(name))
broadcast(b, 0, group_idx=task_grp)
except:
raise RuntimeError('buffer {} does not have task_specific'.format(name))
else:
for name,p in model.named_parameters():
if ignore and name in ignore:
printlog('param {} ignored in broadcast'.format(name))
continue
try:
broadcast(p, 0)
except:
raise RuntimeError('param {} does not have task_specific'.format(name))
for name,b in model.named_buffers():
if ignore and name in ignore:
printlog('buffer {} ignored in broadcast'.format(name))
continue
try:
broadcast(b, 0)
except:
raise RuntimeError('buffer {} does not have task_specific'.format(name))
def broadcast(x, root, group_idx=None):
if group_idx == 0:
group_idx = None
elif group_idx is not None:
return group_idx.broadcast(x, 0)
return dist.broadcast(x, root, group_idx)
def broadcast_params_multitask(model, task_grp, share_backbone_group, share_neck_group, share_decoder_group,
share_adapter_group, ignore):
""" broadcast multi-task model parameters """
if task_grp is not None or share_backbone_group is not None \
or share_neck_group is not None or share_decoder_group is not None or share_adapter_group is not None:
for name,p in model.named_parameters():
if ignore and name in ignore:
printlog('param {} ignored in broadcast'.format(name))
continue
# temporarily do not include the share adapter group in this check
assert p.task_specific + p.backbone_specific + p.neck_specific + p.decoder_specific <= 1.5, \
"param could not be task_specific, backbone_specific, neck_specific, decoder_specific at same time"
try:
if p.task_specific:
start_rank = -task_grp.rank() + dist.get_rank()
printlog(f'broadcasting task-specific param {name}\tgroup_idx={task_grp}')
broadcast(p, start_rank, group_idx=task_grp)
elif p.adapter_specific:
start_rank = -share_adapter_group.rank() + dist.get_rank()
printlog(f'broadcasting adapter-specific param {name}\tgroup_idx={share_adapter_group}')
broadcast(p, start_rank, group_idx=share_adapter_group)
elif p.backbone_specific:
start_rank = -share_backbone_group.rank() + dist.get_rank()
printlog(f'broadcasting backbone-specific param {name}\tgroup_idx={share_backbone_group}')
broadcast(p, start_rank, group_idx=share_backbone_group)
elif p.neck_specific:
start_rank = -share_neck_group.rank() + dist.get_rank()
printlog(f'broadcasting neck-specific param {name}\tgroup_idx={share_neck_group}')
broadcast(p, start_rank, group_idx=share_neck_group)
elif p.decoder_specific:
start_rank = -share_decoder_group.rank() + dist.get_rank()
printlog(f'broadcasting decoder-specific param {name}\tgroup_idx={share_decoder_group}')
broadcast(p, start_rank, group_idx=share_decoder_group)
else:
printlog(f'broadcasting non-specific param {name}')
broadcast(p, 0)
except:
raise RuntimeError('param {} does not have task_specific or backbone_specific or neck_specific or decoder_specific'.format(name))
for name,b in model.named_buffers():
if ignore and name in ignore:
printlog('buffer {} ignored in broadcast'.format(name))
continue
assert b.task_specific + b.backbone_specific + b.neck_specific + b.decoder_specific + b.adapter_specific <= 1, \
"buffer could not be task_specific, backbone_specific, neck_specific, decoder_specific at same time"
try:
if b.task_specific:
start_rank = -task_grp.rank() + dist.get_rank()
printlog('broadcasting task-specific buffer {}'.format(name))
broadcast(b, start_rank, group_idx=task_grp)
elif b.adapter_specific:
start_rank = -share_adapter_group.rank() + dist.get_rank()
printlog(f'broadcasting adapter-specific buffer {name}\tgroup_idx={share_adapter_group}')
broadcast(b, start_rank, group_idx=share_adapter_group)
elif b.backbone_specific:
start_rank = -share_backbone_group.rank() + dist.get_rank()
printlog('broadcasting backbone-specific buffer {}'.format(name))
broadcast(b, start_rank, group_idx=share_backbone_group)
elif b.neck_specific:
start_rank = -share_neck_group.rank() + dist.get_rank()
printlog('broadcasting neck-specific buffer {}'.format(name))
broadcast(b, start_rank, group_idx=share_neck_group)
elif b.decoder_specific:
start_rank = -share_decoder_group.rank() + dist.get_rank()
printlog('broadcasting decoder-specific buffer {}'.format(name))
broadcast(b, start_rank, group_idx=share_decoder_group)
else:
dist.broadcast(b, 0)
except:
raise RuntimeError('buffer {} does not have task_specific'.format(name))
else:
for name,p in model.named_parameters():
if ignore and name in ignore:
printlog('param {} ignored in broadcast'.format(name))
continue
try:
dist.broadcast(p, 0)
except:
raise RuntimeError('param {} does not have task_specific'.format(name))
for name,b in model.named_buffers():
if ignore and name in ignore:
printlog('buffer {} ignored in broadcast'.format(name))
continue
try:
dist.broadcast(b, 0)
except:
raise RuntimeError('buffer {} does not have task_specific'.format(name))
def broadcast_params_unihcpv2(model, task_grp, share_backbone_group, share_decoder_group,
share_rgb_group, share_dense_labeling_group, share_sparse_labeling_group,
share_text_group, share_video_group, ignore,
share_modality_group=None):
""" broadcast multi-task model parameters """
if task_grp is not None or share_backbone_group is not None \
or share_decoder_group is not None or share_rgb_group is not None \
or share_dense_labeling_group is not None or share_sparse_labeling_group is not None \
or share_text_group is not None or share_video_group is not None or share_modality_group is not None:
for name,p in model.named_parameters():
if ignore and name in ignore:
printlog('param {} ignored in broadcast'.format(name))
continue
# temporarily do not include the share adapter group in this check
assert p.task_specific + p.modality_share + p.backbone_specific + p.decoder_specific + p.rgb_specific + \
p.dense_labeling_specific + p.sparse_labeling_specific + p.text_specific + \
p.video_specific <= 1.5, \
"param could not be task_specific, backbone_specific, decoder_specific, modality_specific at same time"
try:
if p.task_specific:
start_rank = -task_grp.rank() + dist.get_rank()
printlog(f'broadcasting task-specific param {name}\tgroup_idx={task_grp}')
broadcast(p, start_rank, group_idx=task_grp)
elif p.modality_share:
start_rank = -share_modality_group.rank() + dist.get_rank()
printlog(f'broadcasting modality-share param {name}\tgroup_idx={share_modality_group}')
broadcast(p, start_rank, group_idx=share_modality_group)
elif p.backbone_specific:
start_rank = -share_backbone_group.rank() + dist.get_rank()
printlog(f'broadcasting backbone-specific param {name}\tgroup_idx={share_backbone_group}')
broadcast(p, start_rank, group_idx=share_backbone_group)
elif p.decoder_specific:
start_rank = -share_decoder_group.rank() + dist.get_rank()
printlog(f'broadcasting decoder-specific param {name}\tgroup_idx={share_decoder_group}')
broadcast(p, start_rank, group_idx=share_decoder_group)
elif p.rgb_specific:
start_rank = -share_rgb_group.rank() + dist.get_rank()
printlog(f'broadcasting rgb-specific param {name}\tgroup_idx={share_rgb_group}')
broadcast(p, start_rank, group_idx=share_rgb_group)
elif p.dense_labeling_specific:
start_rank = -share_dense_labeling_group.rank() + dist.get_rank()
printlog(f'broadcasting dense_labeling-specific param {name}\tgroup_idx={share_dense_labeling_group}')
broadcast(p, start_rank, group_idx=share_dense_labeling_group)
elif p.sparse_labeling_specific:
start_rank = -share_sparse_labeling_group.rank() + dist.get_rank()
printlog(f'broadcasting sparse_labeling-specific param {name}\tgroup_idx={share_sparse_labeling_group}')
broadcast(p, start_rank, group_idx=share_sparse_labeling_group)
elif p.text_specific:
start_rank = -share_text_group.rank() + dist.get_rank()
printlog(f'broadcasting text-specific param {name}\tgroup_idx={share_text_group}')
broadcast(p, start_rank, group_idx=share_text_group)
elif p.video_specific:
start_rank = -share_video_group.rank() + dist.get_rank()
printlog(f'broadcasting video-specific param {name}\tgroup_idx={share_video_group}')
broadcast(p, start_rank, group_idx=share_video_group)
else:
printlog(f'broadcasting non-specific param {name}')
broadcast(p, 0)
except:
raise RuntimeError('param {} does not have task_specific, modality_share, backbone_specific, decoder_specific or per-modality flags'.format(name))
for name,b in model.named_buffers():
if ignore and name in ignore:
printlog('buffer {} ignored in broadcast'.format(name))
continue
assert b.task_specific + b.modality_share + b.backbone_specific + b.decoder_specific + b.rgb_specific + \
b.dense_labeling_specific + b.sparse_labeling_specific + b.text_specific + \
b.video_specific <= 1.5, \
"buffer could not be task_specific, backbone_specific, decoder_specific, modality_specific at same time"
try:
if b.task_specific:
start_rank = -task_grp.rank() + dist.get_rank()
printlog('broadcasting task-specific buffer {}'.format(name))
broadcast(b, start_rank, group_idx=task_grp)
elif b.modality_share:
start_rank = -share_modality_group.rank() + dist.get_rank()
printlog('broadcasting modality-share buffer {}'.format(name))
broadcast(b, start_rank, group_idx=share_modality_group)
elif b.backbone_specific:
start_rank = -share_backbone_group.rank() + dist.get_rank()
printlog('broadcasting backbone-specific buffer {}'.format(name))
broadcast(b, start_rank, group_idx=share_backbone_group)
elif b.decoder_specific:
start_rank = -share_decoder_group.rank() + dist.get_rank()
printlog('broadcasting decoder-specific buffer {}'.format(name))
broadcast(b, start_rank, group_idx=share_decoder_group)
elif b.rgb_specific:
start_rank = -share_rgb_group.rank() + dist.get_rank()
printlog(f'broadcasting rgb-specific buffer {name}\tgroup_idx={share_rgb_group}')
broadcast(b, start_rank, group_idx=share_rgb_group)
elif b.dense_labeling_specific:
start_rank = -share_dense_labeling_group.rank() + dist.get_rank()
printlog(f'broadcasting dense_labeling-specific buffer {name}\tgroup_idx={share_dense_labeling_group}')
broadcast(b, start_rank, group_idx=share_dense_labeling_group)
elif b.sparse_labeling_specific:
start_rank = -share_sparse_labeling_group.rank() + dist.get_rank()
printlog(f'broadcasting sparse_labeling-specific buffer {name}\tgroup_idx={share_sparse_labeling_group}')
broadcast(b, start_rank, group_idx=share_sparse_labeling_group)
elif b.text_specific:
start_rank = -share_text_group.rank() + dist.get_rank()
printlog(f'broadcasting text-specific buffer {name}\tgroup_idx={share_text_group}')
broadcast(b, start_rank, group_idx=share_text_group)
elif b.video_specific:
start_rank = -share_video_group.rank() + dist.get_rank()
printlog(f'broadcasting video-specific buffer {name}\tgroup_idx={share_video_group}')
broadcast(b, start_rank, group_idx=share_video_group)
else:
dist.broadcast(b, 0)
except:
raise RuntimeError('buffer {} does not have task_specific'.format(name))
else:
for name,p in model.named_parameters():
if ignore and name in ignore:
printlog('param {} ignored in broadcast'.format(name))
continue
try:
dist.broadcast(p, 0)
except:
raise RuntimeError('param {} does not have task_specific'.format(name))
for name,b in model.named_buffers():
if ignore and name in ignore:
printlog('buffer {} ignored in broadcast'.format(name))
continue
try:
dist.broadcast(b, 0)
except:
raise RuntimeError('buffer {} does not have task_specific'.format(name))
def find_free_port():
s = socket.socket()
s.bind(('', 0)) # Bind to a free port provided by the host.
port = s.getsockname()[1]
s.close()
return port
def dist_init():
import socket
import time
hostname = socket.gethostname()
# if hostname == 'docker_debug': # hostname of the Ubuntu docker container
# initialize()
# world_size = dist.get_world_size()
# rank = dist.get_rank()
# torch.cuda.set_device(dist.get_local_rank())
# else:
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
if int(os.environ["RANK"]) == 0:
print('Init dist from environment variables (RANK/WORLD_SIZE); not launched via SLURM.')
rank = int(os.environ["RANK"])
world_size = int(os.environ['WORLD_SIZE'])
gpu = int(os.environ['LOCAL_RANK'])
dist_url = 'env://'
os.environ['LOCAL_SIZE'] = str(torch.cuda.device_count())
addr = socket.gethostname()
elif 'SLURM_PROCID' in os.environ:
proc_id = int(os.environ['SLURM_PROCID'])
if proc_id == 0:
print('Init dist using slurm!')
print("Job Id is {} on {} ".format(os.environ["SLURM_JOBID"], os.environ['SLURM_NODELIST']))
ntasks = int(os.environ['SLURM_NTASKS'])
node_list = os.environ['SLURM_NODELIST']
num_gpus = torch.cuda.device_count()
addr = subprocess.getoutput(
'scontrol show hostname {} | head -n1'.format(node_list))
jobid = os.environ["SLURM_JOBID"]
hostfile = "dist_url_" + jobid + ".txt"
if proc_id == 0:
tcp_port = str(find_free_port())
print('write port {} to file: {} '.format(tcp_port, hostfile))
with open(hostfile, "w") as f:
f.write(tcp_port)
else:
print('read port from file: {}'.format(hostfile))
while not os.path.exists(hostfile):
time.sleep(1)
time.sleep(2)
with open(hostfile, "r") as f:
tcp_port = f.read()
os.environ['MASTER_PORT'] = str(tcp_port)
os.environ['MASTER_ADDR'] = addr
os.environ['WORLD_SIZE'] = str(ntasks)
os.environ['RANK'] = str(proc_id)
os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
os.environ['LOCAL_SIZE'] = str(num_gpus)
dist_url = 'env://'
world_size = ntasks
rank = proc_id
gpu = proc_id % num_gpus
else:
print('Not using distributed mode')
distributed = False
return
torch.cuda.set_device(gpu)
dist_backend = 'nccl'
# rank = int(os.environ["RANK"])
# world_size = int(os.environ['WORLD_SIZE'])
print('rank: {} addr: {} port: {}'.format(rank, addr, os.environ['MASTER_PORT']))
torch.distributed.init_process_group(backend=dist_backend, init_method=dist_url,
world_size=world_size, rank=rank)
torch.distributed.barrier()
if 'SLURM_PROCID' in os.environ and rank == 0:
if os.path.isfile(hostfile):
os.remove(hostfile)
if world_size >= 1:
# Setup the local process group (which contains ranks within the same machine)
assert comm_._LOCAL_PROCESS_GROUP is None
num_gpus = torch.cuda.device_count()
num_machines = world_size // num_gpus
for i in range(num_machines):
ranks_on_i = list(range(i * num_gpus, (i + 1) * num_gpus))
print('new_group: {}'.format(ranks_on_i))
pg = torch.distributed.new_group(ranks_on_i)
if rank in ranks_on_i:
# if i == os.environ['SLURM_NODEID']:
comm_._LOCAL_PROCESS_GROUP = pg
return rank, world_size
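# Minimal usage sketch (assumes a torchrun or SLURM launch; `model` and `task_group`
# are placeholders for objects created elsewhere, and model parameters are expected to
# carry the *_specific flags checked by DistModule):
#   rank, world_size = dist_init()
#   task_group = simple_group_split(world_size, rank, num_groups=2)
#   ddp_model = DistModule(model, sync=True, task_grp=task_group)
#   loss.backward()
#   ddp_model.reduce_gradients()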
class DistributedGivenIterationSampler(Sampler):
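"""Sampler that yields exactly total_iter * batch_size indices for this rank.
Index lists are generated once according to shuffle_strategy (see gen_new_list,
gen_s2 and gen_s5) and can optionally be cached to ret_save_path so that a restarted
job resumes from last_iter with the same ordering.
"""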
def __init__(self, dataset, total_iter, batch_size, world_size=None, rank=None, last_iter=-1,
shuffle_strategy=0, random_seed=0, imageNumPerClass=4, ret_save_path=None):
if world_size is None:
world_size = dist.get_world_size()
if rank is None:
rank = dist.get_rank()
assert rank < world_size
sync_print('sampler: rank={}, world_size={}, random_seed={}'.format(rank, world_size, random_seed))
self.dataset = dataset
self.total_iter = total_iter
self.batch_size = batch_size
self.world_size = world_size
self.rank = rank
self.last_iter = last_iter
self.shuffle_strategy = shuffle_strategy
self.random_seed = random_seed
self.imageNumPerClass = imageNumPerClass
self.ret_save_path = ret_save_path
self.task_name = self.dataset.task_name
self.total_size = self.total_iter*self.batch_size
self.call = 0
# generate indices
if self.ret_save_path is not None:
self.this_ret_path = os.path.join(self.ret_save_path, '_'.join([self.task_name, str(self.world_size), str(self.rank)]) + ".pth.tar")
if os.path.exists(self.this_ret_path):
ret_file = torch.load(self.this_ret_path)
# ensure this task and task size is unchanged
if ret_file['task_name'] == self.task_name and ret_file['task_size'] == self.world_size and ret_file['task_rank'] == self.rank:
printlog(" load task sampler from ------> {}".format(self.this_ret_path))
self.indices = ret_file['ret_file']
self.dataset.received_indices = True
return
else:
printlog("sampler file ({}) is not existed, and will be generated now--->".format(self.this_ret_path))
if self.shuffle_strategy in [0,1,3,4,6]:
self.indices = self.gen_new_list()
self.dataset.indices = self.indices
self.dataset.received_indices = True
elif self.shuffle_strategy == 2:
self.indices = self.gen_s2()
elif self.shuffle_strategy == 5:
self.indices = self.gen_s5()
else:
raise Error("Invalid shuffle_strategy!") # todo: undefined 'Error'???
if self.ret_save_path is not None and not os.path.exists(self.ret_save_path):
self.save()
def gen_s2(self):
np.random.seed(self.rank) # set different random seed
indices = []
labels = self.dataset.labels
printlog('using shuffle strategy 2, initializing class map...')
class2id = collections.OrderedDict()
for i,l in enumerate(labels):
if l in class2id:
class2id[l].append(i)
else:
class2id[l] = [i]
keys = list(class2id.keys())
## random shuffle class keys
np.random.shuffle(keys)
num_class = len(keys)
printlog('class map done.')
for i in range((self.last_iter+1)*self.batch_size, self.total_size):
class_id = np.random.randint(0, num_class)
this_num = len(class2id[keys[class_id]])
inner_id = np.random.randint(0, this_num)
# yield class2id[keys[class_id]][inner_id]
indices.append(class2id[keys[class_id]][inner_id])
return indices
def gen_s5(self):
np.random.seed(self.rank) # set different random seed
indices = []
labels = self.dataset.labels
printlog('using shuffle strategy 5, initializing class map...')
class2id = collections.OrderedDict()
for i,l in enumerate(labels):
if l in class2id:
class2id[l].append(i)
else:
class2id[l] = [i]
keys = list(class2id.keys())
## random shuffle class keys
np.random.shuffle(keys)
num_class = len(keys)
printlog('class map done.')
printlog('{} class with {} samples in a batch!'.format(self.batch_size//self.imageNumPerClass, self.imageNumPerClass))
for i in range((self.last_iter+1)*self.batch_size, self.total_size):
if i % self.imageNumPerClass == 0:
class_id = np.random.randint(0, num_class)
this_num = len(class2id[keys[class_id]])
inner_id = np.random.randint(0, this_num)
# yield class2id[keys[class_id]][inner_id]
indices.append(class2id[keys[class_id]][inner_id])
return indices
def __iter__(self):
if self.call == 0:
self.call = 1
return iter(self.indices)
# if self.shuffle_strategy in [2,5]:
# return self.indices
# else:
# return iter(self.indices)
else:
raise RuntimeError("this sampler is not designed to be called more than once!!")
def gen_new_list(self):
# each process shuffle independently
if self.shuffle_strategy == 0:
np.random.seed(self.rank)
indices = np.arange(len(self.dataset))
indices = indices[:self.total_size]
num_repeat = (self.total_size-1) // indices.shape[0] + 1
indices = np.tile(indices, num_repeat)
indices = indices[:self.total_size]
for beg in range(0, self.total_size, len(self.dataset)):
end = min(beg+len(self.dataset), self.total_size)
np.random.shuffle(indices[beg:end])
# each process shuffle all list with same seed, and pick one piece according to rank
elif self.shuffle_strategy == 1:
np.random.seed(self.random_seed)
all_size = self.total_size * self.world_size
indices = np.arange(len(self.dataset))
indices = indices[:all_size]
num_repeat = (all_size-1) // indices.shape[0] + 1
indices = np.tile(indices, num_repeat)
indices = indices[:all_size]
np.random.shuffle(indices)
beg = self.total_size * self.rank
indices = indices[beg:beg+self.total_size]
elif self.shuffle_strategy == 3:
np.random.seed(0)
all_size = self.total_size * self.world_size
## make class map
labels = self.dataset.labels
class2id = collections.OrderedDict()
for i,l in enumerate(labels):
if l in class2id:
class2id[l].append(i)
else:
class2id[l] = [i]
## calc mean_num
class_count = [len(x) for _,x in class2id.items()]
mean_num = int(np.mean(class_count))
## fill class to mean_num
indices = []
for _,v in class2id.items():
if len(v) < mean_num:
lack_num = mean_num - len(v)
indices.extend(np.random.choice(v, lack_num))
indices.extend(v)
indices = np.array(indices)
indices = indices[:all_size]
printlog('using strategy 3, mean_num: {}, origin_len: {}, balanced_len: {}'.format(mean_num, len(self.dataset), len(indices)))
num_repeat = (all_size-1) // indices.shape[0] + 1
indices = np.tile(indices, num_repeat)
indices = indices[:all_size]
np.random.shuffle(indices)
beg = self.total_size * self.rank
indices = indices[beg:beg+self.total_size]
elif self.shuffle_strategy == 6:
np.random.seed(self.random_seed)
labels = self.dataset.labels
printlog('using shuffle strategy 6, initializing class map...')
class2id = collections.defaultdict(list)
for i,l in enumerate(labels):
class2id[l].append(i)
mini_indices = []
for pid, idxs in class2id.items():
if len(idxs) < self.imageNumPerClass:
idxs = idxs + list(np.random.choice(idxs, size=self.imageNumPerClass-len(idxs), replace=True))
elif len(idxs) % self.imageNumPerClass != 0:
add_num = int(len(idxs) // self.imageNumPerClass + 1) * self.imageNumPerClass - len(idxs)
idxs = idxs + list(np.random.choice(idxs, size=add_num, replace=True))
assert len(idxs) % self.imageNumPerClass == 0
mini_indices.extend([idxs[i:i+self.imageNumPerClass] for i in range(0, len(idxs), self.imageNumPerClass)])
np.random.shuffle(mini_indices)
indices = np.array(mini_indices).reshape(-1)
all_size = self.total_size * self.world_size
indices = indices[:all_size]
num_repeat = (all_size-1) // indices.shape[0] + 1
indices = np.tile(indices, num_repeat)
indices = indices[:all_size]
beg = self.total_size * self.rank
indices = indices[beg:beg+self.total_size]
elif self.shuffle_strategy == 7:
np.random.seed(self.random_seed)
labels = self.dataset.labels
printlog('using shuffle strategy 7, initializing class map...')
class2id = collections.defaultdict(list)
for i,l in enumerate(labels):
class2id[l].append(i)
mini_indices = []
for pid, idxs in class2id.items():
if len(idxs) < self.imageNumPerClass:
idxs = idxs + list(np.random.choice(idxs, size=self.imageNumPerClass-len(idxs), replace=True))
elif len(idxs) % self.imageNumPerClass != 0:
add_num = int(len(idxs) // self.imageNumPerClass + 1) * self.imageNumPerClass - len(idxs)
idxs = idxs + list(np.random.choice(idxs, size=add_num, replace=True))
assert len(idxs) % self.imageNumPerClass == 0
mini_indices.extend([idxs[i:i+self.imageNumPerClass] for i in range(0, len(idxs), self.imageNumPerClass)])
np.random.shuffle(mini_indices)
indices = np.array(mini_indices).reshape(-1)
all_size = self.total_size * self.world_size
indices = indices[:all_size]
num_repeat = (all_size-1) // indices.shape[0] + 1
for repeat in range(num_repeat - 1):
for pid, idxs in class2id.items():
if len(idxs) < self.imageNumPerClass:
idxs = idxs + list(np.random.choice(idxs, size=self.imageNumPerClass-len(idxs), replace=True))
elif len(idxs) % self.imageNumPerClass != 0:
add_num = int(len(idxs) // self.imageNumPerClass + 1) * self.imageNumPerClass - len(idxs)
idxs = idxs + list(np.random.choice(idxs, size=add_num, replace=True))
assert len(idxs) % self.imageNumPerClass == 0
mini_indices.extend([idxs[i:i+self.imageNumPerClass] for i in range(0, len(idxs), self.imageNumPerClass)])
np.random.shuffle(mini_indices)
indices = np.array(mini_indices).reshape(-1)
all_size = self.total_size * self.world_size
indices = indices[:all_size]
beg = self.total_size * self.rank
indices = indices[beg:beg+self.total_size]
elif self.shuffle_strategy == 8:
np.random.seed(self.random_seed)
labels = self.dataset.labels
print('using shuffle strategy 8, initializing class map...')
class2id = collections.defaultdict(list)
for i,l in enumerate(labels):
class2id[l].append(i)
pids = set()
cls_pids_idxs = collections.defaultdict(list)
for pid, idxs in class2id.items():
if len(idxs) < self.imageNumPerClass:
idxs = idxs + list(np.random.choice(idxs, size=self.imageNumPerClass-len(idxs), replace=True))
elif len(idxs) % self.imageNumPerClass != 0:
add_num = int(len(idxs) // self.imageNumPerClass + 1) * self.imageNumPerClass - len(idxs)
idxs = idxs + list(np.random.choice(idxs, size=add_num, replace=True))
assert len(idxs) % self.imageNumPerClass == 0
np.random.shuffle(idxs)
idxs = [idxs[i:(i+self.imageNumPerClass)] for i in range(0, len(idxs), self.imageNumPerClass)]
ptr = 0
values = [ptr, idxs]
cls_pids_idxs[pid] = values
pids.add(pid)
indices = []
classnum_per_batch = self.batch_size // self.imageNumPerClass
while len(pids) >= classnum_per_batch:
batch = []
sub_pids = []
for i, pid in enumerate(pids):
if i >= classnum_per_batch:
break
sub_pids.append(pid)
for pid in sub_pids:
ptr = cls_pids_idxs[pid][0]
idxs = cls_pids_idxs[pid][1]
batch.extend(idxs[ptr])
if ptr + 1 >= len(idxs):
pids.remove(pid)
else:
cls_pids_idxs[pid][0] += 1
indices.append(batch)
np.random.shuffle(indices)
indices = np.array(indices).reshape(-1)
all_size = self.total_size * self.world_size
indices = indices[:all_size]
num_repeat = (all_size-1) // indices.shape[0] + 1
indices = np.tile(indices, num_repeat)
indices = indices[:all_size]
beg = self.total_size * self.rank
indices = indices[beg:beg+self.total_size]
else:
raise RuntimeError('unknown shuffle strategy')
assert len(indices) == self.total_size
return indices[(self.last_iter+1)*self.batch_size:]
def __len__(self):
return self.total_size - (self.last_iter+1)*self.batch_size
def save(self):
torch.save({'task_name': self.task_name,
'task_size': self.world_size,
'task_rank': self.rank,
'ret_file': self.indices}, self.this_ret_path)
printlog("save sampler file ------> {}".format(self.this_ret_path))
class RandomIdentitySampler(Sampler):
"""
Randomly sample N identities, then for each identity,
randomly sample K instances, therefore batch size is N*K.
Args:
- dataset: dataset whose ``labels`` attribute gives one identity label per sample.
- imageNumPerClass (int): number of instances per identity in a batch.
- batch_size (int): number of examples in a batch.
"""
def __init__(self, dataset, total_iter, batch_size, world_size=None, rank=None, last_iter=-1,
shuffle_strategy=0, random_seed=0, imageNumPerClass=4, ret_save_path=None):
self.batch_size = batch_size
self.world_size = world_size if world_size is not None else dist.get_world_size()
self.num_instances = imageNumPerClass
self.num_pids_per_batch = self.batch_size // self.num_instances
self.index_dic = defaultdict(list)
self.random_seed = random_seed
self.total_iter = total_iter
self.total_size = self.total_iter*self.batch_size
self.last_iter = last_iter
self.dataset = dataset
labels = self.dataset.labels
printlog('using RandomIdentitySampler, initializing class map...')
self.index_dic = collections.defaultdict(list)
for i,l in enumerate(labels):
self.index_dic[l].append(i)
self.pids = np.array(list(self.index_dic.keys()))
self.rank = rank if rank is not None else dist.get_rank()
def __iter__(self):
np.random.seed(self.random_seed)
self._seed = int(self.random_seed)
final_idxs = self.sample_list()
length = int(math.ceil(len(final_idxs) * 1.0 / self.world_size))
#final_idxs = final_idxs[self.rank * length:(self.rank + 1) * length]
final_idxs = self.__fetch_current_node_idxs(final_idxs, length)
return iter(final_idxs)
def __fetch_current_node_idxs(self, final_idxs, length):
total_num = len(final_idxs)
block_num = (length // self.batch_size)
index_target = []
for i in range(0, block_num * self.world_size, self.world_size):
index = range(self.batch_size * self.rank + self.batch_size * i, min(self.batch_size * self.rank + self.batch_size * (i+1), total_num))
index_target.extend(index)
index_target_npy = np.array(index_target)
final_idxs = list(np.array(final_idxs)[index_target_npy])
return final_idxs
def batch_sample_list(self):
#np.random.seed(self._seed)
avai_pids = copy.deepcopy(self.pids.tolist())
batch_idxs_dict = {}
batch_indices = []
while len(avai_pids) >= self.num_pids_per_batch:
selected_pids = np.random.choice(avai_pids, self.num_pids_per_batch, replace=False).tolist()
for pid in selected_pids:
if pid not in batch_idxs_dict or len(batch_idxs_dict[pid]) < self.num_instances:
idxs = copy.deepcopy(self.index_dic[pid])
if len(idxs) < self.num_instances:
idxs = np.random.choice(idxs, size=self.num_instances, replace=True).tolist()
np.random.shuffle(idxs)
batch_idxs_dict[pid] = idxs
avai_idxs = batch_idxs_dict[pid]
for _ in range(self.num_instances):
batch_indices.append(avai_idxs.pop(0))
if len(avai_idxs) < self.num_instances: avai_pids.remove(pid)
return batch_indices
def sample_list(self):
#np.random.seed(self._seed)
all_size = self.total_size * self.world_size
all_indices = list()
while len(all_indices) <= all_size:
all_indices.extend(self.batch_sample_list())
return all_indices[:all_size]
def __len__(self):
return self.total_size - (self.last_iter+1)*self.batch_size
class RandomIdentityBatchSampler(Sampler):
"""
Randomly sample N identities, then for each identity,
randomly sample K instances, therefore batch size is N*K.
Args:
- dataset: dataset whose ``labels`` attribute gives one identity label per sample.
- imageNumPerClass (int): number of instances per identity in a batch.
- batch_size (int): number of examples in a batch.
"""
def __init__(self, dataset, total_iter, batch_size, world_size=None, rank=None, last_iter=-1,
shuffle_strategy=0, random_seed=0, imageNumPerClass=4, ret_save_path=None):
self.batch_size = batch_size
self.world_size = world_size if world_size is not None else dist.get_world_size()
self.num_instances = imageNumPerClass
self.num_pids_per_batch = self.batch_size // self.num_instances
self.index_dic = defaultdict(list)
self.random_seed = random_seed
self.total_iter = total_iter
self.total_size = self.total_iter*self.batch_size
self.last_iter = last_iter
self.dataset = dataset
labels = self.dataset.labels
printlog('using RandomIdentityBatchSampler, initializing class map...')
self.index_dic = collections.defaultdict(list)
for i,l in enumerate(labels):
self.index_dic[l].append(i)
self.pids = np.array(list(self.index_dic.keys()))
self.rank = rank if rank is not None else dist.get_rank()
def __iter__(self):
np.random.seed(self.random_seed)
self._seed = int(self.random_seed)
final_idxs_batches = self.sample_list()
final_idxs = self.__fetch_current_node_idxs(final_idxs_batches)
return iter(final_idxs)
def __fetch_current_node_idxs(self, final_idxs_batches):
res = []
for final_idxs in final_idxs_batches:
total_num = len(final_idxs)
length = int(math.ceil(len(final_idxs) * 1.0 / self.world_size))
block_num = (length // self.batch_size)
index_target = []
for i in range(0, block_num * self.world_size, self.world_size):
index = range(self.batch_size * self.rank + self.batch_size * i, min(self.batch_size * self.rank + self.batch_size * (i+1), total_num))
index_target.extend(index)
index_target_npy = np.array(index_target)
final_idxs = list(np.array(final_idxs)[index_target_npy])
res.extend(final_idxs)
res = res[:self.total_size]
return res
def batch_sample_list(self):
#np.random.seed(self._seed)
avai_pids = copy.deepcopy(self.pids.tolist())
batch_idxs_dict = {}
batch_indices = []
while len(avai_pids) >= self.num_pids_per_batch:
selected_pids = np.random.choice(avai_pids, self.num_pids_per_batch, replace=False).tolist()
for pid in selected_pids:
if pid not in batch_idxs_dict or len(batch_idxs_dict[pid]) < self.num_instances:
idxs = copy.deepcopy(self.index_dic[pid])
if len(idxs) < self.num_instances:
idxs = np.random.choice(idxs, size=self.num_instances, replace=True).tolist()
np.random.shuffle(idxs)
batch_idxs_dict[pid] = idxs
avai_idxs = batch_idxs_dict[pid]
for _ in range(self.num_instances):
batch_indices.append(avai_idxs.pop(0))
if len(avai_idxs) < self.num_instances: avai_pids.remove(pid)
return batch_indices
def sample_list(self):
#np.random.seed(self._seed)
all_size = self.total_size * self.world_size
all_indices = list()
all_indices_batch = list()
while len(all_indices) <= all_size:
all_indices.extend(self.batch_sample_list())
all_indices_batch.append(self.batch_sample_list())
return all_indices_batch
def __len__(self):
return self.total_size - (self.last_iter+1)*self.batch_size
class RandomIdentityEpochBatchSampler(Sampler):
"""
Randomly sample N identities, then for each identity,
randomly sample K instances, therefore batch size is N*K.
Args:
- dataset: dataset whose ``labels`` attribute gives one identity label per sample.
- imageNumPerClass (int): number of instances per identity in a batch.
- batch_size (int): number of examples in a batch.
"""
def __init__(self, dataset, total_iter, batch_size, world_size=None, rank=None, last_iter=-1,
shuffle_strategy=0, random_seed=0, imageNumPerClass=4, ret_save_path=None):
self.batch_size = batch_size
self.world_size = world_size if world_size is not None else dist.get_world_size()
self.num_instances = imageNumPerClass
self.num_pids_per_batch = self.batch_size // self.num_instances
self.index_dic = defaultdict(list)
self.random_seed = random_seed
self.total_iter = total_iter
self.total_size = self.total_iter*self.batch_size
self.last_iter = last_iter
self.dataset = dataset
labels = self.dataset.labels
printlog('using RandomIdentityEpochBatchSampler, initializing class map...')
self.index_dic = collections.defaultdict(list)
for i,l in enumerate(labels):
self.index_dic[l].append(i)
self.pids = np.array(list(self.index_dic.keys()))
self.rank = rank if rank is not None else dist.get_rank()
# estimate number of examples in an epoch
self.length = 0
for pid in self.pids:
idxs = self.index_dic[pid]
num = len(idxs)
if num < self.num_instances:
num = self.num_instances
self.length += num - num % self.num_instances
self.length //= self.world_size
def __iter__(self):
np.random.seed(self.random_seed)
self._seed = int(self.random_seed)
final_idxs_batches = self.sample_list()
length = int(math.ceil(len(final_idxs_batches) * 1.0 / self.world_size))
final_idxs = self.__fetch_current_node_idxs(final_idxs_batches, length)
self.length = len(final_idxs)
return iter(final_idxs)
def set_epoch(self, epoch):
self.random_seed = self.random_seed + epoch
def __fetch_current_node_idxs(self, final_idxs, length):
total_num = len(final_idxs)
block_num = (length // self.batch_size)
index_target = []
for i in range(0, block_num * self.world_size, self.world_size):
index = range(self.batch_size * self.rank + self.batch_size * i, min(self.batch_size * self.rank + self.batch_size * (i+1), total_num))
index_target.extend(index)
index_target_npy = np.array(index_target)
final_idxs = list(np.array(final_idxs)[index_target_npy])
return final_idxs
def sample_list(self):
#np.random.seed(self._seed)
avai_pids = copy.deepcopy(self.pids.tolist())
batch_idxs_dict = {}
batch_indices = []
while len(avai_pids) >= self.num_pids_per_batch:
selected_pids = np.random.choice(avai_pids, self.num_pids_per_batch, replace=False).tolist()
for pid in selected_pids:
if pid not in batch_idxs_dict or len(batch_idxs_dict[pid]) < self.num_instances:
idxs = copy.deepcopy(self.index_dic[pid])
if len(idxs) < self.num_instances:
idxs = np.random.choice(idxs, size=self.num_instances, replace=True).tolist()
np.random.shuffle(idxs)
batch_idxs_dict[pid] = idxs
avai_idxs = batch_idxs_dict[pid]
for _ in range(self.num_instances):
batch_indices.append(avai_idxs.pop(0))
if len(avai_idxs) < self.num_instances: avai_pids.remove(pid)
return batch_indices
def __len__(self):
return self.length
class DistributedGivenSizeSampler(Sampler):
def __init__(self, dataset, given_size=None, dup_shuffle=False, world_size=None, rank=None):
if world_size is None:
world_size = dist.get_world_size()
if rank is None:
rank = dist.get_rank()
assert rank < world_size
self.dataset = dataset
self.dup_shuffle = dup_shuffle
self.world_size = world_size
self.rank = rank
self.epoch = 0
if given_size is None:
self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.world_size))
else:
self.num_samples = int(math.ceil(given_size * 1.0 / self.world_size))
self.total_size = self.num_samples * self.world_size
if self.dup_shuffle:
self.offset = 0
self.g = torch.Generator()
#self.g.manual_seed(self.rank)
self.indices = self.gen_new_list(self.g)
def __iter__(self):
if self.dup_shuffle:
if self.offset == self.world_size:
self.indices = self.gen_new_list(self.g)
self.offset = 0
beg = self.offset*self.num_samples
indices = self.indices[beg:beg+self.num_samples]
self.offset += 1
else:
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = self.gen_new_list(g)
# subsample
offset = self.num_samples * self.rank
indices = indices[offset:offset + self.num_samples]
assert len(indices) == self.num_samples
return iter(indices)
def gen_new_list(self, g):
origin_indices = list(torch.randperm(len(self.dataset), generator=g))
indices = origin_indices[:]
# add extra samples to meet self.total_size
indices = indices[:self.total_size] # if total_size < len(indices)
extra = self.total_size - len(origin_indices)
while self.total_size - len(indices) > 0:
intake = self.total_size - len(indices)
indices += origin_indices[:intake]
assert len(indices) == self.total_size
return indices
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
class DistributedGroupSampler(Sampler):
"""Sampler that restricts data loading to a subset of the dataset.
It is especially useful in conjunction with
:class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
process can pass a DistributedSampler instance as a DataLoader sampler,
and load a subset of the original dataset that is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Arguments:
dataset: Dataset used for sampling.
num_replicas (optional): Number of processes participating in
distributed training.
rank (optional): Rank of the current process within num_replicas.
seed (int, optional): random seed used to shuffle the sampler if
``shuffle=True``. This number should be identical across all
processes in the distributed group. Default: 0.
"""
def __init__(self,
dataset,
batch_size=1,
num_replicas=None,
world_size=None,
rank=None,
total_iter=-1,
random_seed=0,
last_iter=-1):
_rank, _num_replicas = rank, world_size
if num_replicas is None:
num_replicas = _num_replicas
if rank is None:
rank = _rank
self.dataset = dataset
self.samples_per_gpu = batch_size
self.num_replicas = num_replicas
self.rank = rank
self.seed = random_seed if random_seed is not None else 0
self.total_iter = total_iter
self.batch_size = batch_size
assert hasattr(self.dataset, 'flag')
dataset_flag = self.dataset.flag
self.flag = np.tile(dataset_flag, (total_iter - 1) * batch_size // len(self.dataset) + 1)[:total_iter * batch_size]
self.flag = self.flag[:total_iter*self.batch_size]
self.group_sizes = np.bincount(self.flag)
self.num_samples = 0
for i, j in enumerate(self.group_sizes):
self.num_samples += int(
math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
self.num_replicas)) * self.samples_per_gpu
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.seed)
indices = []
for i, size in enumerate(self.group_sizes):
if size > 0:
indice = np.where(self.flag == i)[0]
assert len(indice) == size
# add .numpy() to avoid bug when selecting indice in parrots.
# TODO: check whether torch.randperm() can be replaced by
# numpy.random.permutation().
indice = indice[list(
torch.randperm(int(size), generator=g).numpy())].tolist()
extra = int(
math.ceil(
size * 1.0 / self.samples_per_gpu / self.num_replicas)
) * self.samples_per_gpu * self.num_replicas - len(indice)
# pad indice
tmp = indice.copy()
for _ in range(extra // size):
indice.extend(tmp)
indice.extend(tmp[:extra % size])
indices.extend(indice)
assert len(indices) == self.total_size
indices = [
indices[j] for i in list(
torch.randperm(
len(indices) // self.samples_per_gpu, generator=g))
for j in range(i * self.samples_per_gpu, (i + 1) *
self.samples_per_gpu)
]
indices = [i % len(self.dataset.flag) for i in indices]
# subsample
offset = self.num_samples * self.rank
indices = indices[offset:offset + self.num_samples]
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
class BatchSampler(Sampler):
r"""Wraps another sampler to yield a mini-batch of indices.
Args:
sampler (Sampler or Iterable): Base sampler. Can be any iterable object
batch_size (int): Size of mini-batch.
drop_last (bool): If ``True``, the sampler will drop the last batch if
its size would be less than ``batch_size``
Example:
>>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False))
[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
>>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True))
[[0, 1, 2], [3, 4, 5], [6, 7, 8]]
"""
def __init__(self, sampler, batch_size, drop_last):
# Since collections.abc.Iterable does not check for `__getitem__`, which
# is one way for an object to be an iterable, we don't do an `isinstance`
# check here.
if not isinstance(batch_size, int) or isinstance(batch_size, bool) or \
batch_size <= 0:
raise ValueError("batch_size should be a positive integer value, "
"but got batch_size={}".format(batch_size))
if not isinstance(drop_last, bool):
raise ValueError("drop_last should be a boolean value, but got "
"drop_last={}".format(drop_last))
self.sampler = sampler
self.batch_size = batch_size
self.drop_last = drop_last
self.flag = self.sampler.flag
def __iter__(self):
ret = []
batch = []
for idx in self.sampler:
batch.append(idx)
if len(batch) == self.batch_size:
batch = tuple(batch)
for i in batch:
assert self.flag[i] == self.flag[batch[0]], \
'all indices in a batch must share the same group flag'
ret.extend(batch)
batch = []
if len(batch) > 0 and not self.drop_last:
ret.extend(batch)
return iter(ret)
def __len__(self):
# Can only be called if self.sampler has __len__ implemented
# We cannot enforce this condition, so we turn off typechecking for the
# implementation below.
# Somewhat related: see NOTE [ Lack of Default `__len__` in Python Abstract Base Classes ]
if self.drop_last:
return len(self.sampler) // self.batch_size
else:
return (len(self.sampler) + self.batch_size - 1) // self.batch_size
class DistributedSequentialSampler(Sampler):
def __init__(self, dataset, world_size=None, rank=None):
if world_size is None:
world_size = dist.get_world_size()
if rank is None:
rank = dist.get_rank()
self.dataset = dataset
self.world_size = world_size
self.rank = rank
assert len(self.dataset) >= self.world_size, f'{len(self.dataset)} vs {self.world_size}'
sub_num = int(math.ceil(len(self.dataset) * 1.0 / self.world_size))
self.beg = sub_num * self.rank
self.end = min(self.beg+sub_num, len(self.dataset))
def __iter__(self):
indices = list(range(self.beg, self.end))
return iter(indices)
def __len__(self):
return self.end - self.beg
def bcast_value(value):
v = torch.Tensor([value])
dist.broadcast(v, 0)
return v.item()
def gather_tensors(input_array):
world_size = dist.get_world_size()
rank = dist.get_rank()
# gather shapes first
myshape = input_array.shape
mycount = input_array.size
shape_tensor = torch.Tensor(np.array(myshape))
all_shape = [torch.Tensor(np.array(myshape)) for i in range(world_size)]
dist.gather(shape_tensor, all_shape if rank == 0 else None, dst=0)
# compute largest shapes on rank0
if rank == 0:
all_shape = [x.numpy() for x in all_shape]
all_count = [int(x.prod()) for x in all_shape] # use for unpadding
all_shape = [list(map(int, x)) for x in all_shape] # use for unpadding
max_count = max(all_count) # use for padding
else:
max_count = 0
max_count = int(bcast_value(max_count))
# padding tensors and gather them
output_tensors = [torch.Tensor(max_count) for i in range(world_size)]
padded_input_array = np.zeros(max_count)
padded_input_array[:mycount] = input_array.reshape(-1)
input_tensor = torch.Tensor(padded_input_array)
dist.gather(input_tensor, output_tensors if rank == 0 else None, dst=0)
# unpadding gathered tensors
if rank == 0:
padded_output = [x.numpy() for x in output_tensors]
output = [x[:all_count[i]].reshape(all_shape[i]) for i,x in enumerate(padded_output)]
else:
output = None
return output
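# gather_tensors collects numpy arrays of different shapes onto rank 0 by first
# gathering shapes, padding every array to the largest element count, gathering the
# padded tensors, and finally unpadding/reshaping on rank 0 (non-root ranks get None).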
# not used in UNIHCP
def allgatherv(input_tensor, flatten=False):
world_size = dist.get_world_size()
rank = dist.get_rank()
# gather shapes first
myshape = torch.Tensor([*input_tensor.size()]).long()
mycount = input_tensor.numel()
all_shape = [torch.zeros_like(myshape, dtype=torch.long) for i in range(world_size)]
dist.all_gather(all_shape, myshape)
# compute largest shapes
all_count = [int(x.prod()) for x in all_shape] # use for unpadding
max_count = max(all_count) # use for padding
# padding tensors and allgather them
output_tensors = [torch.zeros(max_count).to(input_tensor) for i in range(world_size)]
padded_input = torch.zeros(max_count).to(input_tensor)
padded_input[:mycount] = input_tensor.view(-1)
dist.all_gather(output_tensors, padded_input)
# unpadding gathered tensors
if flatten:
output = torch.cat([x[:all_count[i]] for i,x in enumerate(output_tensors)])
else:
output = [x[:all_count[i]].view(*all_shape[i]) for i,x in enumerate(output_tensors)]
return output
def simple_group_split(world_size, rank, num_groups):
groups = []
rank_list = np.split(np.arange(world_size), num_groups)
rank_list = [list(map(int, x)) for x in rank_list]
for i in range(num_groups):
groups.append(dist.new_group(ranks=rank_list[i]))
group_size = world_size // num_groups
return groups[rank//group_size]
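# Example: world_size=8, num_groups=2 -> ranks [0..3] form group 0 and ranks [4..7]
# form group 1; a caller with rank 5 gets back the group containing ranks 4-7.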
def specific_group_split(world_size, rank, group_spec):
## sanity check
assert type(group_spec) is list
assert all(map(lambda x: type(x) is int, group_spec))
num_groups = len(group_spec)
splits = np.sum(group_spec)
assert world_size % splits == 0
unit = int(world_size / splits)
## split
group_spec = [x*unit for x in group_spec]
groups = []
roots = []
last = 0
group_info = edict()
for i,gs in enumerate(group_spec):
ranks = list(map(int, np.arange(last, last+gs)))
groups.append(dist.new_group(ranks=ranks))
roots.append(last)
if rank in ranks:
group_info.group = groups[-1]
group_info.task_size = gs
group_info.task_id = i
group_info.task_sub_id = rank - last
group_info.task_root = last
last += gs
group_info.task_roots = roots
group_info.num_groups = num_groups
return group_info
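# Example: world_size=8, group_spec=[1, 3] -> splits=4, unit=2, so the groups cover
# ranks [0,1] and [2..7]; the returned edict records this rank's group, its size,
# its task_id, the sub-rank within the group, and the root rank of every group.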
def vreduce(x, tensor, group=None):
y = tensor.clone()
if group is not None:
dist.all_reduce(y, group=group)
else:
dist.all_reduce(y)
x.update(y.item())
def vgather(x_list, x):
dist.all_gather(x_list,torch.Tensor([x]).cuda())
def reduce_dict(input_dict, task_size, task_rank, group=None, average=True):
"""
Args:
input_dict (dict): all the values will be reduced
average (bool): whether to do average or sum
Reduce the values in the dictionary from all processes so that all processes
have the averaged results. Returns a dict with the same fields as
input_dict, after reduction.
"""
world_size = task_size
if world_size < 2:
return input_dict
with torch.no_grad():
names = []
values = []
# sort the keys so that they are consistent across processes
for k in sorted(input_dict.keys()):
names.append(k)
values.append(input_dict[k])
values = torch.stack(values, dim=0)
dist.all_reduce(values, group=group)
if average:
values /= world_size
reduced_dict = {k: v for k, v in zip(names, values)}
return reduced_dict
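# Sketch of typical use (loss names are placeholders): average per-task losses across
# the ranks of one task group so every rank logs the same numbers.
#   losses = {'loss_cls': loss_cls.detach(), 'loss_reg': loss_reg.detach()}
#   reduced = reduce_dict(losses, task_size=group_info.task_size,
#                         task_rank=group_info.task_sub_id, group=group_info.group)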