import datetime
import functools
import os
import sys
from typing import List
from typing import Union

import pytz
import torch
import torch.distributed as tdist
import torch.multiprocessing as mp

__rank, __local_rank, __world_size, __device = 0, 0, 1, 'cpu'
__rank_str_zfill = '0'
__initialized = False


def initialized():
    return __initialized


def __initialize(fork=False, backend='nccl', gpu_id_if_not_distributed=0, timeout_minutes=30):
    global __device
    if not torch.cuda.is_available():
        print('[dist initialize] cuda is not available, use cpu instead', file=sys.stderr)
        return
    elif 'RANK' not in os.environ:
        torch.cuda.set_device(gpu_id_if_not_distributed)
        __device = torch.empty(1).cuda().device
        print(f'[dist initialize] env variable "RANK" is not set, use {__device} as the device', file=sys.stderr)
        return
    # then 'RANK' must exist
    global_rank, num_gpus = int(os.environ['RANK']), torch.cuda.device_count()
    local_rank = global_rank % num_gpus
    torch.cuda.set_device(local_rank)
    
    # ref: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py#L29
    """
    if mp.get_start_method(allow_none=True) is None:
        method = 'fork' if fork else 'spawn'
        print(f'[dist initialize] mp method={method}')
        mp.set_start_method(method)
    """
    tdist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=timeout_minutes * 60))
    
    global __rank, __local_rank, __world_size, __initialized, __rank_str_zfill
    __local_rank = local_rank
    __rank, __world_size = tdist.get_rank(), tdist.get_world_size()
    __rank_str_zfill = str(__rank).zfill(len(str(__world_size)))
    __device = torch.device(local_rank)
    __initialized = True
    
    assert tdist.is_initialized(), 'torch.distributed is not initialized!'
    print(f'[lrk={get_local_rank()}, rk={get_rank()}]')
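
# A minimal launch sketch (an assumption about how this module is driven, not something defined
# in this file): __initialize() relies on torch.distributed's default env:// rendezvous, so each
# process must be started with RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT set, e.g. via
# torchrun:
#
#   torchrun --nproc_per_node=8 train.py
#
# and inside a hypothetical train.py:
#
#   import dist                                              # this module
#   dist.init_distributed_mode(local_out_path='local_output')
#   print(f'rank {dist.get_rank()} / world size {dist.get_world_size()}')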


def get_rank():
    return __rank


def get_rank_given_group(group: tdist.ProcessGroup):
    return tdist.get_rank(group=group)


def get_rank_str_zfill():
    return __rank_str_zfill


def get_local_rank():
    return __local_rank


def get_world_size():
    return __world_size


def get_device():
    return __device


def set_gpu_id(gpu_id: int):
    if gpu_id is None: return
    global __device
    if isinstance(gpu_id, (str, int)):
        torch.cuda.set_device(int(gpu_id))
        __device = torch.empty(1).cuda().device
    else:
        raise NotImplementedError


def is_master():
    return __rank == 0


def is_local_master():
    return __local_rank == 0


def is_visualizer():
    return __rank == 0
    # return __rank == max(__world_size - 8, 0)


def parallelize(net, syncbn=False):
    if syncbn:
        net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
    net = net.cuda()
    net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[get_local_rank()], find_unused_parameters=False, broadcast_buffers=False)
    return net
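
# Usage sketch (the model name below is illustrative, not from this file):
#
#   net = parallelize(MyModel(), syncbn=True)
#   # net is now a DistributedDataParallel wrapper on the local GPU; gradients are averaged
#   # across ranks during backward(), and BatchNorm layers have been converted to SyncBatchNorm.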


def new_group(ranks: List[int]):
    if __initialized:
        return tdist.new_group(ranks=ranks)
    return None


def new_local_machine_group():
    if __initialized:
        cur_subgroup, subgroups = tdist.new_subgroups()
        return cur_subgroup
    return None


def barrier():
    if __initialized:
        tdist.barrier()


def allreduce(t: torch.Tensor, async_op=False):
    if __initialized:
        if not t.is_cuda:
            cu = t.detach().cuda()
            ret = tdist.all_reduce(cu, async_op=async_op)
            t.copy_(cu.cpu())
        else:
            ret = tdist.all_reduce(t, async_op=async_op)
        return ret
    return None
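
# Usage sketch (variable names are illustrative): summing a scalar metric over all ranks. The
# CPU branch above copies the tensor to the GPU, reduces it there (NCCL only reduces CUDA
# tensors), and writes the result back in place.
#
#   loss_t = torch.tensor([local_loss])      # hypothetical per-rank scalar
#   allreduce(loss_t)                        # in-place SUM across all ranks
#   mean_loss = loss_t.item() / get_world_size()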


def allgather(t: torch.Tensor, cat=True) -> Union[List[torch.Tensor], torch.Tensor]:
    if __initialized:
        if not t.is_cuda:
            t = t.cuda()
        ls = [torch.empty_like(t) for _ in range(__world_size)]
        tdist.all_gather(ls, t)
    else:
        ls = [t]
    if cat:
        ls = torch.cat(ls, dim=0)
    return ls


def allgather_diff_shape(t: torch.Tensor, cat=True) -> Union[List[torch.Tensor], torch.Tensor]:
    if __initialized:
        if not t.is_cuda:
            t = t.cuda()
        
        t_size = torch.tensor(t.size(), device=t.device)
        ls_size = [torch.empty_like(t_size) for _ in range(__world_size)]
        tdist.all_gather(ls_size, t_size)
        
        max_B = max(size[0].item() for size in ls_size)
        pad = max_B - t_size[0].item()
        if pad:
            pad_size = (pad, *t.size()[1:])
            t = torch.cat((t, t.new_empty(pad_size)), dim=0)
        
        ls_padded = [torch.empty_like(t) for _ in range(__world_size)]
        tdist.all_gather(ls_padded, t)
        ls = []
        for t, size in zip(ls_padded, ls_size):
            ls.append(t[:size[0].item()])
    else:
        ls = [t]
    if cat:
        ls = torch.cat(ls, dim=0)
    return ls
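
# How the padding above works, on a hypothetical 2-rank example: if rank 0 holds a (3, D) tensor
# and rank 1 a (5, D) tensor, each rank first all_gathers the shapes, pads its own tensor to
# (5, D) with uninitialized rows, all_gathers the padded tensors, and finally slices every
# gathered tensor back to its true first-dimension length, so the padding never reaches the result.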


def broadcast(t: torch.Tensor, src_rank) -> None:
    if __initialized:
        if not t.is_cuda:
            cu = t.detach().cuda()
            tdist.broadcast(cu, src=src_rank)
            t.copy_(cu.cpu())
        else:
            tdist.broadcast(t, src=src_rank)


def dist_fmt_vals(val: float, fmt: Union[str, None] = '%.2f') -> Union[torch.Tensor, List]:
    if not initialized():
        return torch.tensor([val]) if fmt is None else [fmt % val]
    
    ts = torch.zeros(__world_size)
    ts[__rank] = val
    allreduce(ts)
    if fmt is None:
        return ts
    return [fmt % v for v in ts.cpu().numpy().tolist()]
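
# Usage sketch: on 4 ranks, dist_fmt_vals(acc) returns a list like ['97.50', '98.12', '96.88', '97.81']
# with one formatted entry per rank (each rank fills its own slot before the all-reduce), or the
# raw per-rank tensor when fmt is None. The numbers here are made up for illustration.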


def master_only(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        force = kwargs.pop('force', False)
        if force or is_master():
            ret = func(*args, **kwargs)
        else:
            ret = None
        barrier()
        return ret
    return wrapper


def local_master_only(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        force = kwargs.pop('force', False)
        if force or is_local_master():
            ret = func(*args, **kwargs)
        else:
            ret = None
        barrier()
        return ret
    return wrapper
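
# Usage sketch (the checkpoint helper is hypothetical): only the (local) master executes the body,
# while the trailing barrier() inside the wrapper keeps every rank in step; passing force=True
# runs the function on all ranks.
#
#   @master_only
#   def save_checkpoint(state, path):
#       torch.save(state, path)
#
#   save_checkpoint(state, 'ckpt.pth')               # executed on rank 0 only
#   save_checkpoint(state, 'ckpt.pth', force=True)   # executed on every rank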


def for_visualize(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if is_visualizer():
            # with torch.no_grad():
            ret = func(*args, **kwargs)
        else:
            ret = None
        return ret
    return wrapper


def finalize():
    if __initialized:
        tdist.destroy_process_group()


def init_distributed_mode(local_out_path, fork=False, only_sync_master=False, timeout_minutes=30):
    try:
        __initialize(fork=fork, timeout_minutes=timeout_minutes)
        barrier()
    except RuntimeError as e:
        print(f'{"!"*80} dist init error (NCCL Error?), stopping training! {"!"*80}', flush=True)
        raise e
    
    if local_out_path is not None: os.makedirs(local_out_path, exist_ok=True)
    _change_builtin_print(is_local_master())
    if (is_master() if only_sync_master else is_local_master()) and local_out_path is not None and len(local_out_path):
        sys.stdout, sys.stderr = BackupStreamToFile(local_out_path, for_stdout=True), BackupStreamToFile(local_out_path, for_stdout=False)


def _change_builtin_print(is_master):
    import builtins as __builtin__
    builtin_print = __builtin__.print
    if type(builtin_print) != type(open):   # print is no longer a builtin function, i.e. already patched
        return
    
    def prt(*args, **kwargs):
        force = kwargs.pop('force', False)
        clean = kwargs.pop('clean', False)
        deeper = kwargs.pop('deeper', False)
        if is_master or force:
            if not clean:
                f_back = sys._getframe().f_back
                if deeper and f_back.f_back is not None:
                    f_back = f_back.f_back
                file_desc = f'{f_back.f_code.co_filename:24s}'[-24:]
                time_str = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime('[%m-%d %H:%M:%S]')
                builtin_print(f'{time_str} ({file_desc}, line{f_back.f_lineno:-4d})=>', *args, **kwargs)
            else:
                builtin_print(*args, **kwargs)
    
    __builtin__.print = prt
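
# After _change_builtin_print, a plain print('hello') on the (local) master is rendered roughly as
#   [05-01 12:34:56] (.../train.py            , line 123)=> hello
# (timestamp in Asia/Shanghai time, caller file and line); print(..., clean=True) skips the prefix,
# force=True also prints on non-master ranks, and deeper=True reports the caller's caller instead.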


class BackupStreamToFile(object):
    def __init__(self, local_output_dir, for_stdout=True):
        self.for_stdout = for_stdout
        self.terminal_stream = sys.stdout if for_stdout else sys.stderr
        fname = os.path.join(local_output_dir, 'b1_stdout.txt' if for_stdout else 'b2_stderr.txt')
        existing = os.path.exists(fname)
        self.file_stream = open(fname, 'a')
        if existing:
            time_str = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime('[%m-%d %H:%M:%S]')
            self.file_stream.write('\n'*7 + '='*55 + f' RESTART {time_str} ' + '='*55 + '\n')
        self.file_stream.flush()
        os.system(f'ln -s {fname} /opt/tiger/run_trial/ >/dev/null 2>&1')
        self.enabled = True
    
    def write(self, message):
        self.terminal_stream.write(message)
        self.file_stream.write(message)
    
    def flush(self):
        self.terminal_stream.flush()
        self.file_stream.flush()
    
    def isatty(self):
        return True
    
    def close(self):
        if not self.enabled:
            return
        self.enabled = False
        self.file_stream.flush()
        self.file_stream.close()
        if self.for_stdout:
            sys.stdout = self.terminal_stream
            sys.stdout.flush()
        else:
            sys.stderr = self.terminal_stream
            sys.stderr.flush()
    
    def __del__(self):
        self.close()