#!/bin/bash
#SBATCH --job-name=trlx
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=g40
#SBATCH --mem=0
#SBATCH --output=logs/%x_%j.out
#SBATCH --error=logs/%x_%j.err
#SBATCH --comment=carperai
#SBATCH --exclusive

# Slurm batch script: changes into a trlx checkout and launches a training
# script through srun, passing it an accelerate config file plus the conda
# installation directory and environment name.
#
# Example usage:
#   sbatch slurm_train.sh TRLX_DIR
#
# Positional arguments (all optional; an omitted or empty value selects the
# default):
#   $1  TRLX_DIR        directory to change into before launching
#   $2  TRAIN_SCRIPT    script handed to srun, relative to TRLX_DIR
#   $3  CONFIG_FILE     first argument of TRAIN_SCRIPT, relative to TRLX_DIR
#   $4  CONDA_DIR       second argument of TRAIN_SCRIPT
#   $5  CONDA_ENV_NAME  third argument of TRAIN_SCRIPT
#
# Requires SLURM_JOB_NODELIST in the environment (the script aborts under
# `set -u` otherwise) and `scontrol` / `srun` on PATH.
# NOTE(review): the --output/--error paths point into logs/; presumably that
# directory must already exist where the job is submitted -- confirm.

set -exuo pipefail

# Print a message to stderr and abort the job.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Library and binary search paths for the EFA / NCCL / CUDA installs on the
# cluster image. The previous LD_LIBRARY_PATH is appended only when it is
# non-empty, so an unset variable neither trips `set -u` nor leaves a
# trailing empty entry in the search path.
export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH

# NCCL and libfabric (FI_*) settings, exported so the launched tasks see them.
export NCCL_DEBUG=WARN
export NCCL_PROTO=simple
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
# export CUDA_LAUNCH_BLOCKING=1

# Expand the job's node list once. Assignment is kept separate from `export`
# so that a failing scontrol stops the script instead of being masked by
# export's own (always zero) exit status.
HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
[[ -n "$HOSTNAMES" ]] || die "scontrol returned no hostnames for '$SLURM_JOB_NODELIST'"
mapfile -t node_list <<<"$HOSTNAMES"

MASTER_ADDR=${node_list[0]}  # first host in the list
MASTER_PORT=1234
COUNT_NODE=${#node_list[@]}  # number of hosts in the list
export HOSTNAMES MASTER_ADDR MASTER_PORT COUNT_NODE

TRLX_DIR=${1:-/fsx/home-amuzio/trlx}
TRAIN_SCRIPT=${2:-scripts/accelerate_train_example.sh} # relative to TRLX_DIR
CONFIG_FILE=${3:-configs/accelerate/zero2-bf16.yaml}   # relative to TRLX_DIR
CONDA_DIR=${4:-/admin/home-amuzio/miniconda3}
CONDA_ENV_NAME=${5:-trlx}

# The relative TRAIN_SCRIPT / CONFIG_FILE paths resolve from here.
pushd "$TRLX_DIR"

srun --comment carperai "$TRAIN_SCRIPT" \
  "$CONFIG_FILE" \
  "$CONDA_DIR" \
  "$CONDA_ENV_NAME"