"""
Experiment related stuffs
Act as a bridge between main and utils (logging, init directory, etc)
"""
from pathlib import Path
import os
import random
import numpy as np
# import cupyx.distributed  # only needed for the optional NCCLBackend call below

import torch.distributed as dist
import torch


def init_experiment(cfgs):
    """
    in:
        cfgs: arguments such as hyperparameters and other
    out:
        --cfgs
    procedure to initialize experiment consisting of:
        - parse config file as a json dictionary
        - initialize logging
        - create dictionary to save everything
    """

    assert 'exp_name' in cfgs, "cfgs must define 'exp_name'"

    save_dir = cfgs['env']['save_dir']
    # Register and create every per-run output directory in one pass.
    subdirs = {'summary_dir': 'summaries', 'checkpoint_dir': 'checkpoints',
               'output_dir': 'output', 'log_dir': 'logs', 'cfg_dir': 'cfgs'}
    for key, name in subdirs.items():
        cfgs[key] = os.path.join(save_dir, name)
        Path(cfgs[key]).mkdir(parents=True, exist_ok=True)

    mode = cfgs["mode"]
    dataset = cfgs[f"{mode}_dataset"]['name']
    split = cfgs[f"{mode}_dataset"]['args']['split']
    cfgs['run_description'] = f'{mode}_{dataset}_{split}'
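
# Example usage (sketch; the key names come from this module's code, but the
# values below are made up for illustration):
#
#     cfgs = {
#         'exp_name': 'demo',
#         'mode': 'train',
#         'env': {'save_dir': './runs/demo'},
#         'train_dataset': {'name': 'cifar10', 'args': {'split': 'train'}},
#     }
#     init_experiment(cfgs)
#     cfgs['checkpoint_dir']  # -> './runs/demo/checkpoints'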


def init_deterministic(random_seed=7):
    """Seed all RNGs and configure cuDNN for reproducible runs."""
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)  # seeds the CPU (and, in recent PyTorch, CUDA) generators
    torch.cuda.manual_seed_all(random_seed)
    # benchmark=True lets cuDNN autotune to nondeterministic kernels, which
    # defeats the purpose of this function; disable it and force
    # deterministic algorithms instead.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
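
# Typical call, once at startup before building models and dataloaders
# (42 here is an arbitrary example seed):
#
#     init_deterministic(random_seed=42)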


def init_distributed_mode(cfgs):
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        cfgs['rank'] = int(os.environ["RANK"])
        cfgs['world_size'] = int(os.environ['WORLD_SIZE'])
        cfgs['gpu'] = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        cfgs['rank'] = int(os.environ['SLURM_PROCID'])
        # SLURM does not set WORLD_SIZE; take it from SLURM_NTASKS so
        # init_process_group below receives a valid world size.
        cfgs['world_size'] = int(os.environ['SLURM_NTASKS'])
        cfgs['gpu'] = cfgs['rank'] % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        cfgs['distributed'] = False
        return

    cfgs['distributed'] = True
    torch.cuda.set_device(cfgs['gpu'])
    cfgs['dist_backend'] = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        cfgs['rank'], cfgs['dist_url']), flush=True)
    dist.init_process_group(backend=cfgs['dist_backend'], init_method=cfgs['dist_url'],
                            world_size=cfgs['world_size'], rank=cfgs['rank'])
    # Optional CuPy collective backend (requires the cupyx.distributed import above):
    # cupyx.distributed.NCCLBackend(n_devices=cfgs['world_size'], rank=cfgs['rank'])
    dist.barrier()
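
# This module expects the standard torch.distributed environment variables
# (RANK, WORLD_SIZE, LOCAL_RANK), which torchrun sets automatically. A typical
# launch (sketch; 'main.py' and the GPU count are placeholders), with
# cfgs['dist_url'] = 'env://' so init_process_group reads those variables:
#
#     torchrun --nproc_per_node=4 main.py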