Spaces:
Running
on
Zero
Running
on
Zero
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import argparse | |
import logging | |
import os | |
from pathlib import Path | |
from typing import List, Optional | |
import submitit | |
from dinov2.utils.cluster import ( | |
get_slurm_executor_parameters, | |
get_slurm_partition, | |
get_user_checkpoint_path, | |
) | |
# Module-level logger, registered under the fixed "dinov2" name.
logger = logging.getLogger("dinov2")
def get_args_parser(
    description: Optional[str] = None,
    parents: Optional[List[argparse.ArgumentParser]] = None,
    add_help: bool = True,
) -> argparse.ArgumentParser:
    """Build the argument parser holding the job-submission options.

    Args:
        description: Passed through as the parser's description.
        parents: Parsers whose arguments are inherited; ``None`` is treated
            as an empty list.
        add_help: Passed through to ``argparse.ArgumentParser``.

    Returns:
        A parser defining ``--ngpus``, ``--nodes``, ``--timeout``,
        ``--partition``, ``--use-volta32``, ``--comment`` and ``--exclude``.
    """
    parents = parents or []
    # Looked up on every call, so the default follows the current environment.
    slurm_partition = get_slurm_partition()

    parser = argparse.ArgumentParser(
        description=description,
        parents=parents,
        add_help=add_help,
    )
    # argparse derives the destination from the first long option, so the
    # aliases below all populate ``args.ngpus``.
    parser.add_argument(
        "--ngpus",
        "--gpus",
        "--gpus-per-node",
        default=8,
        type=int,
        help="Number of GPUs to request on each node",
    )
    # Same rule: both spellings populate ``args.nodes``.
    parser.add_argument(
        "--nodes",
        "--nnodes",
        default=2,
        type=int,
        help="Number of nodes to request",
    )
    parser.add_argument(
        "--timeout",
        default=2800,
        type=int,
        help="Duration of the job",
    )
    parser.add_argument(
        "--partition",
        default=slurm_partition,
        type=str,
        help="Partition where to submit",
    )
    parser.add_argument(
        "--use-volta32",
        action="store_true",
        help="Request V100-32GB GPUs",
    )
    parser.add_argument(
        "--comment",
        default="",
        type=str,
        help="Comment to pass to scheduler, e.g. priority message",
    )
    parser.add_argument(
        "--exclude",
        default="",
        type=str,
        help="Nodes to exclude",
    )
    return parser
def get_shared_folder() -> Path:
    """Return the ``experiments`` directory under the user checkpoint path.

    The directory is created when it does not exist yet.

    Returns:
        Path to the ``experiments`` directory.

    Raises:
        RuntimeError: If ``get_user_checkpoint_path()`` returns ``None``.
    """
    user_checkpoint_path = get_user_checkpoint_path()
    if user_checkpoint_path is None:
        raise RuntimeError("Path to user checkpoint cannot be determined")

    path = user_checkpoint_path / "experiments"
    # Only the leaf directory is created here; the checkpoint root itself is
    # expected to exist already.
    path.mkdir(exist_ok=True)
    return path
def submit_jobs(task_class, args, name: str):
    """Submit ``task_class(args)`` as a job through a submitit executor.

    Args:
        task_class: Callable invoked as ``task_class(args)`` to build the task
            handed to ``executor.submit``.
        args: Parsed options. Reads ``output_dir``, ``use_volta32``,
            ``comment``, ``exclude``, ``nodes``, ``ngpus``, ``timeout`` and
            ``partition``. ``output_dir`` is filled in with
            ``<shared folder>/%j`` when it is missing or empty.
        name: Job name passed to the executor.
    """
    # A missing attribute is handled like an empty value: use the default.
    if not getattr(args, "output_dir", None):
        args.output_dir = str(get_shared_folder() / "%j")

    # "%j" is a placeholder for the job id (it is substituted below once the
    # id is known). Create only the part of the path that does not depend on
    # it, so no directory literally named "%j" is left behind.
    static_parts = []
    for part in Path(args.output_dir).parts:
        if "%j" in part:
            break
        static_parts.append(part)
    Path(*static_parts).mkdir(parents=True, exist_ok=True)

    executor = submitit.AutoExecutor(folder=args.output_dir, slurm_max_num_timeout=30)

    # Optional scheduler settings are forwarded only when requested.
    kwargs = {}
    if args.use_volta32:
        kwargs["slurm_constraint"] = "volta32gb"
    if args.comment:
        kwargs["slurm_comment"] = args.comment
    if args.exclude:
        kwargs["slurm_exclude"] = args.exclude

    executor_params = get_slurm_executor_parameters(
        nodes=args.nodes,
        num_gpus_per_node=args.ngpus,
        timeout_min=args.timeout,  # max is 60 * 72
        slurm_signal_delay_s=120,
        slurm_partition=args.partition,
        **kwargs,
    )
    executor.update_parameters(name=name, **executor_params)

    task = task_class(args)
    job = executor.submit(task)

    logger.info("Submitted job_id: %s", job.job_id)
    # Report the concrete location by substituting the real job id.
    str_output_dir = os.path.abspath(args.output_dir).replace("%j", str(job.job_id))
    logger.info("Logs and checkpoints will be saved at: %s", str_output_dir)