# Copyright (c) 2015-present, Facebook, Inc. # All rights reserved. """ Misc functions, including distributed helpers. Mostly copy-paste from torchvision references. """ import io import os import time from collections import defaultdict, deque import datetime import torch import torch.distributed as dist import logging logger_initialized = {} def group_subnets_by_flops(data, flops_gap=1.0): sorted_data = {k: v for k, v in sorted(data.items(), key=lambda item: item[1])} candidate_idx = [] grouped_cands = [] last_flops = 0 for cfg_id, flops in sorted_data.items(): flops = flops / 1e9 if abs(last_flops - flops) > flops_gap: if len(candidate_idx) > 0: grouped_cands.append(sorted(candidate_idx)) candidate_idx = [int(cfg_id)] last_flops = flops else: candidate_idx.append(int(cfg_id)) if len(candidate_idx) > 0: grouped_cands.append(sorted(candidate_idx)) return grouped_cands def find_best_candidates(data): sorted_data = {k: v for k, v in sorted(data.items(), key=lambda item: item[1])} candidate_idx = [] last_flops = 0 for cfg_id, values in sorted_data.items(): flops, score = values if abs(last_flops - flops) > 1: candidate_idx.append(cfg_id) last_flops = flops else: if score > data[candidate_idx[-1]][1]: candidate_idx[-1] = cfg_id return candidate_idx def find_top_candidates(data, ratio=0.9): sorted_data = {k: v for k, v in sorted(data.items(), key=lambda item: item[1])} candidate_idx = [] grouped_cands = [] last_flops = 0 for cfg_id, values in sorted_data.items(): flops, score = values if abs(last_flops - flops) > 3: if len(candidate_idx) > 0: grouped_cands.append(candidate_idx) candidate_idx = [cfg_id] last_flops = flops else: candidate_idx.append(cfg_id) if len(candidate_idx) > 0: grouped_cands.append(candidate_idx) final_list = [] for group in grouped_cands: if len(group) == 1: final_list += list(map(int, group)) continue scores = torch.tensor([sorted_data[cfg_id][-1] for cfg_id in group]) indices = torch.argsort(scores, descending=True) num_selected = int(ratio*len(group)) if int(ratio*len(group)) > 0 else 1 top_ids = indices[:num_selected].tolist() selected = [group[idx] for idx in top_ids] final_list += list(map(int, selected)) return final_list def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): """Initialize and get a logger by name. If the logger has not been initialized, this method will initialize the logger by adding one or two handlers, otherwise the initialized logger will be directly returned. During initialization, a StreamHandler will always be added. If `log_file` is specified and the process rank is 0, a FileHandler will also be added. Args: name (str): Logger name. log_file (str | None): The log filename. If specified, a FileHandler will be added to the logger. log_level (int): The logger level. Note that only the process of rank 0 is affected, and other processes will set the level to "Error" thus be silent most of the time. file_mode (str): The file mode used in opening log file. Defaults to 'w'. Returns: logging.Logger: The expected logger. """ logger = logging.getLogger(name) if name in logger_initialized: return logger # handle hierarchical names # e.g., logger "a" is initialized, then logger "a.b" will skip the # initialization since it is a child of "a". for logger_name in logger_initialized: if name.startswith(logger_name): return logger stream_handler = logging.StreamHandler() handlers = [stream_handler] if dist.is_available() and dist.is_initialized(): rank = dist.get_rank() else: rank = 0 # only rank 0 will add a FileHandler if rank == 0 and log_file is not None: # Here, the default behaviour of the official logger is 'a'. Thus, we # provide an interface to change the file mode to the default # behaviour. file_handler = logging.FileHandler(log_file, file_mode) handlers.append(file_handler) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') for handler in handlers: handler.setFormatter(formatter) handler.setLevel(log_level) logger.addHandler(handler) if rank == 0: logger.setLevel(log_level) else: logger.setLevel(logging.ERROR) logger_initialized[name] = True return logger def get_root_logger(log_file=None, log_level=logging.INFO): """Get the root logger. The logger will be initialized if it has not been initialized. By default a StreamHandler will be added. If `log_file` is specified, a FileHandler will also be added. The name of the root logger is the top-level package name, e.g., "mmseg". Args: log_file (str | None): The log filename. If specified, a FileHandler will be added to the root logger. log_level (int): The root logger level. Note that only the process of rank 0 is affected, while other processes will set the level to "Error" and be silent most of the time. Returns: logging.Logger: The root logger. """ logger = get_logger(name='snnet', log_file=log_file, log_level=log_level) return logger class SmoothedValue(object): """Track a series of values and provide access to smoothed values over a window or the global series average. """ def __init__(self, window_size=20, fmt=None): if fmt is None: fmt = "{median:.4f} ({global_avg:.4f})" self.deque = deque(maxlen=window_size) self.total = 0.0 self.count = 0 self.fmt = fmt def update(self, value, n=1): self.deque.append(value) self.count += n self.total += value * n def synchronize_between_processes(self): """ Warning: does not synchronize the deque! """ if not is_dist_avail_and_initialized(): return t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') dist.barrier() dist.all_reduce(t) t = t.tolist() self.count = int(t[0]) self.total = t[1] @property def median(self): d = torch.tensor(list(self.deque)) return d.median().item() @property def avg(self): d = torch.tensor(list(self.deque), dtype=torch.float32) return d.mean().item() @property def global_avg(self): return self.total / self.count @property def max(self): return max(self.deque) @property def value(self): return self.deque[-1] def __str__(self): return self.fmt.format( median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value) class MetricLogger(object): def __init__(self, delimiter="\t", logger=None): self.meters = defaultdict(SmoothedValue) self.delimiter = delimiter self.logger = logger def update(self, **kwargs): for k, v in kwargs.items(): if isinstance(v, torch.Tensor): v = v.item() assert isinstance(v, (float, int)) self.meters[k].update(v) def __getattr__(self, attr): if attr in self.meters: return self.meters[attr] if attr in self.__dict__: return self.__dict__[attr] raise AttributeError("'{}' object has no attribute '{}'".format( type(self).__name__, attr)) def __str__(self): loss_str = [] for name, meter in self.meters.items(): loss_str.append( "{}: {}".format(name, str(meter)) ) return self.delimiter.join(loss_str) def synchronize_between_processes(self): for meter in self.meters.values(): meter.synchronize_between_processes() def add_meter(self, name, meter): self.meters[name] = meter def log_every(self, iterable, print_freq, header=None): i = 0 if not header: header = '' start_time = time.time() end = time.time() iter_time = SmoothedValue(fmt='{avg:.4f}') data_time = SmoothedValue(fmt='{avg:.4f}') space_fmt = ':' + str(len(str(len(iterable)))) + 'd' log_msg = [ header, '[{0' + space_fmt + '}/{1}]', 'eta: {eta}', '{meters}', 'time: {time}', 'data: {data}' ] if torch.cuda.is_available(): log_msg.append('max mem: {memory:.0f}') log_msg = self.delimiter.join(log_msg) MB = 1024.0 * 1024.0 for obj in iterable: data_time.update(time.time() - end) yield obj iter_time.update(time.time() - end) if i % print_freq == 0 or i == len(iterable) - 1: eta_seconds = iter_time.global_avg * (len(iterable) - i) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if torch.cuda.is_available(): self.logger.info(log_msg.format( i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time), memory=torch.cuda.max_memory_allocated() / MB)) else: self.logger.info(log_msg.format( i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time))) i += 1 end = time.time() total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) self.logger.info('{} Total time: {} ({:.4f} s / it)'.format( header, total_time_str, total_time / len(iterable))) def _load_checkpoint_for_ema(model_ema, checkpoint): """ Workaround for ModelEma._load_checkpoint to accept an already-loaded object """ mem_file = io.BytesIO() torch.save({'state_dict_ema':checkpoint}, mem_file) mem_file.seek(0) model_ema._load_checkpoint(mem_file) def setup_for_distributed(is_master): """ This function disables printing when not in master process """ import builtins as __builtin__ builtin_print = __builtin__.print def print(*args, **kwargs): force = kwargs.pop('force', False) if is_master or force: builtin_print(*args, **kwargs) __builtin__.print = print def is_dist_avail_and_initialized(): if not dist.is_available(): return False if not dist.is_initialized(): return False return True def get_world_size(): if not is_dist_avail_and_initialized(): return 1 return dist.get_world_size() def get_rank(): if not is_dist_avail_and_initialized(): return 0 return dist.get_rank() def is_main_process(): return get_rank() == 0 def save_on_master(*args, **kwargs): if is_main_process(): torch.save(*args, **kwargs) def init_distributed_mode(args): if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: args.rank = int(os.environ["RANK"]) args.world_size = int(os.environ['WORLD_SIZE']) args.gpu = int(os.environ['LOCAL_RANK']) elif 'SLURM_PROCID' in os.environ: args.rank = int(os.environ['SLURM_PROCID']) args.gpu = args.rank % torch.cuda.device_count() else: print('Not using distributed mode') args.distributed = False return args.distributed = True torch.cuda.set_device(args.gpu) args.dist_backend = 'nccl' print('| distributed init (rank {}): {}'.format( args.rank, args.dist_url), flush=True) torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) torch.distributed.barrier() setup_for_distributed(args.rank == 0) import json def save_on_master_eval_res(log_stats, output_dir): if is_main_process(): with open(output_dir, 'a') as f: f.write(json.dumps(log_stats) + "\n")