File size: 1,902 Bytes
f4eebef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
#!/bin/bash
# SLURM batch job: generate code completions for one model on one task with
# the bigcode-evaluation-harness.
#
# Positional arguments:
#   $1  model     - model name (combined with the organisation as "$org/$model")
#   $2  task      - evaluation task name
#   $3  org       - organisation / namespace that owns the model
#   $4  out_path  - directory that receives the generations JSON file
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
#SBATCH --cpus-per-task=48
#SBATCH --gres=gpu:4
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/loubna/logs/evaluation/leaderboard/%x-%j.out

# Fail fast on a wrong invocation: with a missing argument the job would
# otherwise start with an empty model/task name or an output path under "/".
if (( $# < 4 )); then
  echo "usage: sbatch <this script> <model> <task> <org> <out_path>" >&2
  exit 2
fi

# Trace every command into the job log and stop at the first failing command.
set -x -e

# Python environment
source /admin/home/loubna/.bashrc
conda activate brr4

echo "START TIME: $(date)"

# Distributed topology, derived from the SLURM allocation.
GPUS_PER_NODE=4
# First hostname of the allocation acts as the rendezvous address.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(( GPUS_PER_NODE * NNODES ))

# Job parameters (see the header for their meaning).
model=$1
task=$2
org=$3
out_path=$4
# Evaluation-harness command line, stored in CMD as one shell-quoted string.
# Reads:  $org, $model, $task, $out_path
# Writes: CMD
cmd_args=(
  /fsx/loubna/code/bigcode-evaluation-harness/main.py
  --model "$org/$model"
  --tasks "$task"
  --max_length_generation 512
  --batch_size 50
  --n_samples 50
  --temperature 0.2
  --precision bf16
  --allow_code_execution
  --trust_remote_code
  --save_generations
  --use_auth_token
  --generation_only
  # ${task}_${model}: braces delimit the variable names, so no backslash is
  # needed to keep "_" from being read as part of the name.
  --save_generations_path "$out_path/generations_${task}_${model}.json"
)
# CMD is later re-parsed by a shell, so every argument is serialised with %q:
# the re-parsed words are then exactly the array elements above, whatever
# characters the values contain.
printf -v CMD '%q ' "${cmd_args[@]}"
# `accelerate launch` prefix, exported as LAUNCHER (one string that is later
# placed in front of the program to run and parsed again by a shell).
# Reads:  $NNODES, $WORLD_SIZE, $MASTER_ADDR, $MASTER_PORT
# Writes: LAUNCHER (exported)
#
# Double-quoted entries are expanded now. Single-quoted entries keep a literal
# `$VAR` so that it is expanded only by the shell that finally executes the
# string, i.e. with the values of the task that runs it.
launcher_args=(
  "accelerate launch"
  "--multi_gpu"
  "--num_machines $NNODES"
  "--num_processes $WORLD_SIZE"
  "--main_process_ip $MASTER_ADDR"
  "--main_process_port $MASTER_PORT"
  '--machine_rank $SLURM_PROCID'
  # Deferred like the rank so the role label names the node that runs the
  # task rather than the node that built this string.
  '--role $SLURMD_NODENAME:'
  "--rdzv_conf rdzv_backend=c10d"
  "--max_restarts 0"
  "--tee 3"
)
# Join with single spaces (independent of IFS); a trailing space is harmless.
printf -v LAUNCHER '%s ' "${launcher_args[@]}"
export LAUNCHER
# Environment exported to the launched processes.

# Crash instead of hanging when NCCL runs into trouble (e.g. a stuck broadcast).
export NCCL_ASYNC_ERROR_HANDLING=1

# AWS-specific settings.
export \
  NCCL_PROTO=simple \
  RDMAV_FORK_SAFE=1 \
  FI_EFA_FORK_SAFE=1 \
  FI_EFA_USE_DEVICE_RDMA=1 \
  FI_PROVIDER=efa \
  FI_LOG_LEVEL=1 \
  NCCL_IB_DISABLE=1 \
  NCCL_SOCKET_IFNAME=ens
# Record the evaluation command in the job log exactly as stored (quoted so
# that it is neither word-split nor glob-expanded).
printf '%s\n' "$CMD"

# Options for srun, kept in an array so that each one stays a separate word.
SRUN_ARGS=(
  --wait=60
  --kill-on-bad-exit=1
)

# Run launcher + evaluation command through `bash -c` inside the allocation,
# so that variables left unexpanded in LAUNCHER are resolved in the task.
# There is deliberately no `clear` here: a batch job has no terminal, so it
# would only write escape codes to the log, or fail and abort the job under
# `set -e` when TERM is unset.
srun "${SRUN_ARGS[@]}" --jobid "$SLURM_JOB_ID" bash -c "$LAUNCHER $CMD"

echo "END TIME: $(date)"