ahmedashrafay
/

staradapters-evaluations

Model card Files Files and versions Community

staradapters-evaluations / multiple_eval.slurm

ahmedashrafay

Add metrics/generations for ahmedashrafay/staradapters-python

f4eebef verified 21 days ago

raw

history blame

1.9 kB

	#!/bin/bash
	#SBATCH --nodes=1
	#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
	#SBATCH --cpus-per-task=48
	#SBATCH --gres=gpu:4
	#SBATCH --partition=production-cluster
	#SBATCH --output=/fsx/loubna/logs/evaluation/leaderboard/%x-%j.out

	# Trace every command (-x) and abort on the first failing command (-e).
	set -x -e
	source /admin/home/loubna/.bashrc

	conda activate brr4

	# Record when the job started.
	echo "START TIME: $(date)"

	# Topology values consumed by the launcher definition further down.
	GPUS_PER_NODE=4
	# Rendezvous host: the first hostname of the job's node list.
	# The pipe must be a real pipe; an escaped "\|" would be handed to scontrol
	# as a literal argument together with "head -n 1".
	MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
	MASTER_PORT=6000
	NNODES=$SLURM_NNODES
	NODE_RANK=$SLURM_PROCID
	WORLD_SIZE=$((GPUS_PER_NODE * NNODES))


	# Positional arguments:
	#   $1  model     model name; combined with org as "$org/$model"
	#   $2  task      value passed to --tasks
	#   $3  org       organisation / namespace prefix of the model
	#   $4  out_path  directory prefix for the saved generations file
	# Fail fast instead of launching with an empty model id or output path.
	if [ "$#" -lt 4 ]; then
		echo "usage: sbatch multiple_eval.slurm <model> <task> <org> <out_path>" >&2
		exit 2
	fi
	model=$1
	task=$2
	org=$3
	out_path=$4

	# Command line for the evaluation harness entry point. It is stored as one
	# string (backslash-newlines are line continuations inside the quotes) and is
	# later appended to $LAUNCHER and re-parsed by `bash -c`.
	# Braces delimit the variable names in the output filename; the previous
	# "$task\_$model" form kept a literal backslash in the string and only yielded
	# the intended name after the second shell parse.
	CMD="\
	/fsx/loubna/code/bigcode-evaluation-harness/main.py \
	--model ${org}/${model} \
	--tasks ${task} \
	--max_length_generation 512 \
	--batch_size 50 \
	--n_samples 50 \
	--temperature 0.2 \
	--precision bf16 \
	--allow_code_execution \
	--trust_remote_code \
	--save_generations \
	--use_auth_token \
	--generation_only \
	--save_generations_path ${out_path}/generations_${task}_${model}.json \
	"

	# Launcher prefix that is placed in front of $CMD and run through
	# `srun ... bash -c`.
	# Unescaped variables are expanded now, in the batch script. Variables written
	# as \$NAME stay literal in the string and are expanded by the `bash -c` shell
	# of each srun task, so they carry per-task values.
	# --num_processes is given once (it used to be repeated), and the node name for
	# --role is deferred to the task shell like the machine rank.
	export LAUNCHER="accelerate launch \
	--multi_gpu \
	--num_machines $NNODES \
	--num_processes $WORLD_SIZE \
	--main_process_ip $MASTER_ADDR \
	--main_process_port $MASTER_PORT \
	--machine_rank \$SLURM_PROCID \
	--role \$SLURMD_NODENAME: \
	--rdzv_conf rdzv_backend=c10d \
	--max_restarts 0 \
	--tee 3 \
	"

	# Make NCCL problems (e.g. a hanging broadcast) crash the job instead of stalling.
	export NCCL_ASYNC_ERROR_HANDLING=1

	# AWS-specific NCCL / libfabric (EFA) settings, exported together.
	export \
		NCCL_PROTO=simple \
		RDMAV_FORK_SAFE=1 \
		FI_EFA_FORK_SAFE=1 \
		FI_EFA_USE_DEVICE_RDMA=1 \
		FI_PROVIDER=efa \
		FI_LOG_LEVEL=1 \
		NCCL_IB_DISABLE=1 \
		NCCL_SOCKET_IFNAME=ens

	# Log the harness command exactly as stored (quoted, so no word splitting or
	# glob expansion is applied to it).
	printf '%s\n' "$CMD"

	# Extra srun options, kept in an array so each option stays a separate word.
	SRUN_ARGS=(
		--wait=60
		--kill-on-bad-exit=1
	)

	# Run launcher + harness command in a fresh shell on the allocation; variables
	# that were \$-escaped in $LAUNCHER are expanded by that shell.
	# No `clear` here: output goes to the job log file, and under `set -e` a
	# non-zero exit from `clear` (e.g. no usable TERM) would abort before srun.
	srun "${SRUN_ARGS[@]}" --jobid "$SLURM_JOB_ID" bash -c "$LAUNCHER $CMD"

	echo "END TIME: $(date)"