#!/bin/bash
#SBATCH --nodes=1
# crucial - only 1 task per dist per node!
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=48
#SBATCH --gres=gpu:4
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/loubna/logs/evaluation/leaderboard/%x-%j.out
#
# Runs bigcode-evaluation-harness main.py in generation-only mode through
# `accelerate launch`, started by srun inside this allocation, and saves the
# generations to OUT_PATH/generations_TASK_MODEL.json.
#
# Usage: sbatch <this script> MODEL TASK ORG OUT_PATH
#   MODEL     model name; the harness receives it as ORG/MODEL
#   TASK      value passed to --tasks
#   ORG       organisation / namespace prefix of the model
#   OUT_PATH  directory that receives the generations JSON file

# -u is deliberately not enabled: the sourced .bashrc and the conda activation
# are outside this script's control and may read unset variables.
set -x -e

# Fail fast instead of launching a job with an empty model, task or path.
if (( $# < 4 )); then
    printf 'Usage: sbatch <this script> MODEL TASK ORG OUT_PATH\n' >&2
    exit 2
fi

source /admin/home/loubna/.bashrc
conda activate brr4

# File Path setup
echo "START TIME: $(date)"

GPUS_PER_NODE=4
# The first host of the allocation is used as the main process address.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
WORLD_SIZE=$(( GPUS_PER_NODE * NNODES ))

model=$1
task=$2
org=$3
out_path=$4

# Evaluation harness invocation (script path followed by its arguments).
eval_args=(
    /fsx/loubna/code/bigcode-evaluation-harness/main.py
    --model "${org}/${model}"
    --tasks "$task"
    --max_length_generation 512
    --batch_size 50
    --n_samples 50
    --temperature 0.2
    --precision bf16
    --allow_code_execution
    --trust_remote_code
    --save_generations
    --use_auth_token
    --generation_only
    --save_generations_path "${out_path}/generations_${task}_${model}.json"
)
# Render as a shell-quoted string because the command is re-parsed by
# `bash -c` inside srun below.
printf -v CMD '%q ' "${eval_args[@]}"

launcher_args=(
    accelerate launch
    --multi_gpu
    --num_machines "$NNODES"
    --num_processes "$WORLD_SIZE"
    --main_process_ip "$MASTER_ADDR"
    --main_process_port "$MASTER_PORT"
    --role "${SLURMD_NODENAME}:"
    --rdzv_conf rdzv_backend=c10d
    --max_restarts 0
    --tee 3
)
printf -v LAUNCHER '%q ' "${launcher_args[@]}"
# Intentionally single-quoted: SLURM_PROCID must be expanded by the shell that
# srun starts for each task, not by this batch script.
# shellcheck disable=SC2016
LAUNCHER+='--machine_rank "$SLURM_PROCID" '
export LAUNCHER

# force crashing on nccl issues like hanging broadcast
export NCCL_ASYNC_ERROR_HANDLING=1

# AWS specific
export NCCL_PROTO=simple
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens

# Log the evaluation command that is about to be launched.
printf '%s\n' "$CMD"

srun_args=(
    --wait=60
    --kill-on-bad-exit=1
)

srun "${srun_args[@]}" --jobid "$SLURM_JOB_ID" bash -c "${LAUNCHER}${CMD}"

echo "END TIME: $(date)"