"""Distributed launcher adapted from torch.distributed.launch.

Usage example: https://github.com/facebookresearch/maskrcnn-benchmark

Each rank is started as its own subprocess, so every spawned process is
treated as a main process and can still use multiprocessing on its own.
"""

import sys
import subprocess
from argparse import ArgumentParser, REMAINDER

from utils import str2bool, int2str


def parse_args():
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that will spawn up "
                                        "multiple distributed processes")

    parser.add_argument('--n_GPUs', type=int, default=1,
                        help='the number of GPUs for training')
    parser.add_argument('training_script', type=str,
                        help="The full path to the single GPU training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")
    # Everything after the training script path is forwarded to it untouched.
    parser.add_argument('training_script_args', nargs=REMAINDER)

    return parser.parse_args()


def main():
    args = parse_args()

    processes = []
    for rank in range(args.n_GPUs):
        # Run the training script with the same interpreter as this launcher.
        cmd = [sys.executable]
        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)

        # Flags consumed by the training script itself.
        cmd += ['--distributed', 'True']
        cmd += ['--launched', 'True']
        cmd += ['--n_GPUs', str(args.n_GPUs)]
        cmd += ['--rank', str(rank)]

        process = subprocess.Popen(cmd)
        processes.append(process)

    # Wait for every rank; on failure, report that process's own command
    # rather than whatever cmd was last built in the loop above.
    for process in processes:
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(returncode=process.returncode,
                                                cmd=process.args)
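

# For reference only: a minimal sketch of how the launched training script
# might consume the flags injected above. The argument names match what the
# launcher passes, but the use of str2bool and the defaults are assumptions,
# not taken from this repository's training script:
#
#   parser.add_argument('--distributed', type=str2bool, default=False)
#   parser.add_argument('--launched', type=str2bool, default=False)
#   parser.add_argument('--n_GPUs', type=int, default=1)
#   parser.add_argument('--rank', type=int, default=0)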


if __name__ == "__main__":
    main()