staradapters-evaluations / multiple_eval.slurm
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # crucial - a single srun task per node; accelerate spawns one process per GPU
#SBATCH --cpus-per-task=48
#SBATCH --gres=gpu:4
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/loubna/logs/evaluation/leaderboard/%x-%j.out
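
# Usage: sbatch multiple_eval.slurm <model> <task> <org> <out_path>
# (the four positional arguments are read below as model=$1, task=$2, org=$3, out_path=$4)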
set -x -e
source /admin/home/loubna/.bashrc
conda activate brr4
echo "START TIME: $(date)"

# Distributed environment setup
GPUS_PER_NODE=4
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
model=$1
task=$2
org=$3
out_path=$4
CMD="\
/fsx/loubna/code/bigcode-evaluation-harness/main.py \
--model $org/$model \
--tasks $task \
--max_length_generation 512 \
--batch_size 50 \
--n_samples 50 \
--temperature 0.2 \
--precision bf16 \
--allow_code_execution \
--trust_remote_code \
--save_generations \
--use_auth_token \
--generation_only \
 --save_generations_path $out_path/generations_${task}_${model}.json \
"
export LAUNCHER="accelerate launch \
--multi_gpu \
--num_machines $NNODES \
--num_processes $WORLD_SIZE \
--main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \
--machine_rank \$SLURM_PROCID \
--role \$SLURMD_NODENAME: \
--rdzv_conf rdzv_backend=c10d \
--max_restarts 0 \
--tee 3 \
"
# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1
# AWS specific
export NCCL_PROTO=simple
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens
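# NCCL_IB_DISABLE=1 and NCCL_SOCKET_IFNAME=ens keep NCCL off the InfiniBand verbs transport
# and pin its socket traffic to the ens* interfaces on these EFA-equipped nodes.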
echo $CMD
SRUN_ARGS=" \
--wait=60 \
--kill-on-bad-exit=1 \
"
srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER $CMD"
echo "END TIME: $(date)"