File size: 2,720 Bytes
8d015d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
"""
Experiment-related utilities.
Acts as a bridge between main and utils (logging, directory initialization, etc.).
"""
from pathlib import Path
import os
import random
import numpy as np
import cupyx.distributed
import torch.distributed as dist
import torch
def init_experiment(cfgs):
    """
    Prepare the output directories and bookkeeping entries for a run.

    in:
        cfgs: mapping-style configuration. Must contain 'exp_name', 'mode',
            cfgs['env']['save_dir'] and a '<mode>_dataset' entry providing
            ['name'] and ['args']['split'].
    out:
        cfgs: the same object, updated in place with
            - 'summary_dir', 'checkpoint_dir', 'output_dir', 'log_dir',
              'cfg_dir': sub-directories of cfgs['env']['save_dir']
            - 'run_description': '<mode>_<dataset name>_<split>'
    raises:
        AssertionError: if 'exp_name' is missing from cfgs.
        KeyError: if any other required entry is missing.

    All five directories are created (parents included) when absent;
    existing directories are left untouched.
    """
    # Explicit raise instead of `assert` so the check still runs under
    # `python -O`; AssertionError is kept so callers see the same exception.
    if 'exp_name' not in cfgs:
        raise AssertionError("cfgs must contain 'exp_name'")

    save_dir = cfgs['env']['save_dir']
    # (cfgs key, sub-directory name) pairs, in the order they are registered.
    dir_layout = (
        ('summary_dir', "summaries"),
        ('checkpoint_dir', "checkpoints"),
        ('output_dir', "output"),
        ('log_dir', "logs"),
        ('cfg_dir', "cfgs"),
    )
    for key, sub_dir in dir_layout:
        cfgs[key] = os.path.join(save_dir, sub_dir)

    mode = cfgs["mode"]
    dataset_cfg = cfgs[f"{mode}_dataset"]
    dataset = dataset_cfg['name']
    split = dataset_cfg['args']['split']
    cfgs['run_description'] = f'{mode}_{dataset}_{split}'

    # Directories are created only after every config lookup has succeeded,
    # so a malformed config does not leave a partial tree on disk.
    for key, _ in dir_layout:
        Path(cfgs[key]).mkdir(parents=True, exist_ok=True)

    return cfgs
def init_deterministic(random_seed=7, cudnn_benchmark=False):
    """
    Seed every random number generator used by the project and configure
    cuDNN for reproducible runs.

    in:
        random_seed: integer seed applied to Python's `random`, NumPy and
            PyTorch (CPU and all CUDA devices).
        cudnn_benchmark: when False (default) the cuDNN autotuner is turned
            off and deterministic cuDNN algorithms are requested. Pass True
            to re-enable the autotuner, trading reproducibility for speed.
    out:
        None
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    # torch.manual_seed is the same function as torch.random.manual_seed,
    # so a single call is enough.
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    # The autotuner may select different convolution algorithms from run to
    # run, which defeats the purpose of this function, so it is off by default.
    torch.backends.cudnn.benchmark = bool(cudnn_benchmark)
    if not cudnn_benchmark:
        torch.backends.cudnn.deterministic = True
def init_distributed_mode(cfgs):
    """
    Detect the launcher from environment variables and initialise
    torch.distributed accordingly.

    in:
        cfgs: mapping-style configuration. Must provide 'dist_url' when a
            distributed launch is detected.
    out:
        None; cfgs is updated in place:
            - 'distributed': True if a process group was initialised,
              False otherwise
            - 'rank', 'world_size', 'gpu', 'dist_backend' (distributed only)

    Launch detection:
        - RANK and WORLD_SIZE set: rank, world size and GPU index are read
          from RANK, WORLD_SIZE and LOCAL_RANK.
        - SLURM_PROCID set: rank comes from SLURM_PROCID and the GPU index
          is rank modulo the number of visible CUDA devices. The world size
          is taken from cfgs if already present, otherwise from SLURM_NTASKS.
        - otherwise: a message is printed and the function returns without
          touching torch.distributed.
    """
    env = os.environ
    if 'RANK' in env and 'WORLD_SIZE' in env:
        cfgs['rank'] = int(env["RANK"])
        cfgs['world_size'] = int(env['WORLD_SIZE'])
        cfgs['gpu'] = int(env['LOCAL_RANK'])
    elif 'SLURM_PROCID' in env:
        cfgs['rank'] = int(env['SLURM_PROCID'])
        cfgs['gpu'] = cfgs['rank'] % torch.cuda.device_count()
        # This branch previously never set 'world_size', so
        # init_process_group failed with KeyError unless the config already
        # carried it. An existing value is kept; otherwise use the task count.
        if 'world_size' not in cfgs:
            cfgs['world_size'] = int(env['SLURM_NTASKS'])
    else:
        print('Not using distributed mode')
        cfgs['distributed'] = False
        return

    cfgs['distributed'] = True
    torch.cuda.set_device(cfgs['gpu'])
    cfgs['dist_backend'] = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        cfgs['rank'], cfgs['dist_url']), flush=True)
    dist.init_process_group(
        backend=cfgs['dist_backend'],
        init_method=cfgs['dist_url'],
        world_size=cfgs['world_size'],
        rank=cfgs['rank'],
    )
    # Disabled: optional CuPy NCCL backend initialisation.
    # cupyx.distributed.NCCLBackend(n_devices=cfgs['world_size'], rank=cfgs['rank'])
    # Block until every rank has joined the process group.
    dist.barrier()
|