#!/bin/bash
#SBATCH --job-name=alpaca-7 # Specify job name
#SBATCH --partition=pgpu # Specify partition name
#SBATCH --mem=0 # Use entire memory of node
#SBATCH --gres=gpu:8 # Generic resources; request 8 GPUs
#SBATCH --exclusive # Do not share node
#SBATCH --time=48:00:00 # Set a limit on the total run time
#SBATCH --output=logs_alp-7.o%j # File name for standard output
#SBATCH --error=errors_alp-7.e%j # File name for standard error output
cd /path/to/gitrepo
# activate conda environment
source /home/user/miniconda3/etc/profile.d/conda.sh
conda activate medalpaca
# it is recommended to set the HF cache directory manually, as the model files are large
export HF_HOME="/path/to/your/hfcache"
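# optional safeguard (not in the original script): create the cache directory
# up front so the job does not fail if it is missing on the compute node
mkdir -p "$HF_HOME"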
# feel free to adapt the command below to run the training
# in 8-bit with LoRA, fp16 with LoRA, or bf16 with FSDP
# (a commented-out 8-bit LoRA variant is sketched after the command)
torchrun --nproc_per_node=8 --master_port=9876 medalpaca/train.py \
--model 'decapoda-research/llama-7b-hf' \
--data_path 'medical_meadow_small.json' \
--output_dir './lora-alpaca-7b' \
--train_in_8bit False \
--use_lora False \
--bf16 True \
--tf32 True \
--fp16 False \
--gradient_checkpointing True \
--global_batch_size 256 \
--per_device_batch_size 4 \
--wandb_project 'medalpaca' \
--use_wandb False
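
# A minimal sketch of the 8-bit LoRA variant mentioned above, assuming the same
# train.py flags shown in the command; it only flips the precision and LoRA
# switches and keeps everything else unchanged. LoRA-specific hyperparameters
# are left at the training script's defaults; adjust the precision flags for
# your hardware before uncommenting.
# torchrun --nproc_per_node=8 --master_port=9876 medalpaca/train.py \
# --model 'decapoda-research/llama-7b-hf' \
# --data_path 'medical_meadow_small.json' \
# --output_dir './lora-alpaca-7b' \
# --train_in_8bit True \
# --use_lora True \
# --bf16 False \
# --tf32 True \
# --fp16 True \
# --gradient_checkpointing True \
# --global_batch_size 256 \
# --per_device_batch_size 4 \
# --wandb_project 'medalpaca' \
# --use_wandb False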