#!/bin/bash
#SBATCH --job-name=alpaca-7 # Specify job name
#SBATCH --partition=pgpu # Specify partition name
#SBATCH --mem=0 # Use entire memory of node
#SBATCH --gres=gpu:8 # Generic resources; request 8 GPUs
#SBATCH --exclusive # Do not share node
#SBATCH --time=48:00:00 # Set a limit on the total run time
#SBATCH --output=logs_alp-7.o%j # File name for standard output
#SBATCH --error=errors_alp-7.e%j # File name for standard error output
cd /path/to/gitrepo
# activate conda environment
source /home/user/miniconda3/etc/profile.d/conda.sh
conda activate medalpaca
# it is recommended to set the HF cache directory manually, as the model files are large
export HF_HOME="/path/to/your/hfcache"
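# optional safeguard (not in the original script): create the cache directory
# up front so the job does not fail if it is missing on the compute node
mkdir -p "$HF_HOME"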
# feel free to adapt the command below to run the training
# in 8-bit with LoRA, fp16 with LoRA, or bf16 with FSDP
# (a commented-out 8-bit LoRA variant is sketched after the command)
torchrun --nproc_per_node=8 --master_port=9876 medalpaca/train.py \
--model 'decapoda-research/llama-7b-hf' \
--data_path 'medical_meadow_small.json' \
--output_dir './lora-alpaca-7b' \
--train_in_8bit False \
--use_lora False \
--bf16 True \
--tf32 True \
--fp16 False \
--gradient_checkpointing True \
--global_batch_size 256 \
--per_device_batch_size 4 \
--wandb_project 'medalpaca' \
--use_wandb False
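
# A minimal sketch of the 8-bit LoRA variant mentioned above, assuming the same
# train.py flags shown in the command; it only flips the precision and LoRA
# switches and keeps everything else unchanged. LoRA-specific hyperparameters
# are left at the training script's defaults; adjust the precision flags for
# your hardware before uncommenting.
# torchrun --nproc_per_node=8 --master_port=9876 medalpaca/train.py \
# --model 'decapoda-research/llama-7b-hf' \
# --data_path 'medical_meadow_small.json' \
# --output_dir './lora-alpaca-7b' \
# --train_in_8bit True \
# --use_lora True \
# --bf16 False \
# --tf32 True \
# --fp16 True \
# --gradient_checkpointing True \
# --global_batch_size 256 \
# --per_device_batch_size 4 \
# --wandb_project 'medalpaca' \
# --use_wandb False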