# lambdanet/deblur/src/launch.py
""" distributed launcher adopted from torch.distributed.launch
usage example: https://github.com/facebookresearch/maskrcnn-benchmark
This enables using multiprocessing for each spawned process (as they are treated as main processes)
"""
import sys
import subprocess
from argparse import ArgumentParser, REMAINDER


def parse_args():
    parser = ArgumentParser(description="PyTorch distributed training launch "
                                        "helper utility that spawns multiple "
                                        "distributed processes")
    parser.add_argument('--n_GPUs', type=int, default=1,
                        help='the number of GPUs for training')
    # positional
    parser.add_argument("training_script", type=str,
                        help="The full path to the single-GPU training "
                             "program/script to be launched in parallel, "
                             "followed by all the arguments for the "
                             "training script")
    # everything after the script path is forwarded to the training program
    parser.add_argument('training_script_args', nargs=REMAINDER)
    return parser.parse_args()
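

# A minimal sketch of the receiving side (hypothetical; the actual training
# script of this repo is not shown here). It assumes the script parses the
# flags appended by main() below and initializes torch.distributed through
# an explicit tcp:// rendezvous, since this launcher sets no environment
# variables such as MASTER_ADDR. The backend and address are assumptions.
# Nothing in this file calls it.
def _training_script_sketch():
    import argparse
    import torch.distributed as dist

    parser = argparse.ArgumentParser()
    parser.add_argument('--distributed', type=str, default='False')
    parser.add_argument('--launched', type=str, default='False')
    parser.add_argument('--n_GPUs', type=int, default=1)
    parser.add_argument('--rank', type=int, default=0)
    args, _ = parser.parse_known_args()

    if args.distributed == 'True':
        # one launched process per GPU; rank doubles as the device index
        dist.init_process_group(backend='nccl',                       # assumed
                                init_method='tcp://127.0.0.1:23456',  # assumed
                                world_size=args.n_GPUs,
                                rank=args.rank)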

def main():
    args = parse_args()
    # spawn one training process per GPU, forwarding the user's arguments
    # and injecting the distributed-training flags expected by the script
    processes = []
    for rank in range(args.n_GPUs):
        cmd = [sys.executable, args.training_script]
        cmd.extend(args.training_script_args)
        cmd += ['--distributed', 'True']
        cmd += ['--launched', 'True']
        cmd += ['--n_GPUs', str(args.n_GPUs)]
        cmd += ['--rank', str(rank)]
        processes.append(subprocess.Popen(cmd))

    # wait for all workers; raise on the first nonzero exit code, reporting
    # the failing process's own command line rather than the last one spawned
    for process in processes:
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(returncode=process.returncode,
                                                cmd=process.args)
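

# A sketch of a stricter failure policy (not wired into main() above, and
# the helper name is hypothetical): poll all children and terminate the
# survivors as soon as any one of them fails, so a single crashed rank does
# not leave the remaining ranks running.
def _wait_all_or_terminate(processes):
    import time
    pending = list(processes)
    while pending:
        for process in list(pending):
            ret = process.poll()
            if ret is None:
                continue  # still running
            pending.remove(process)
            if ret != 0:
                for survivor in pending:
                    survivor.terminate()
                raise subprocess.CalledProcessError(returncode=ret,
                                                    cmd=process.args)
        time.sleep(1)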

if __name__ == "__main__":
    main()