#!/bin/bash
torchrun --nproc_per_node=8 train.py \
    --model_name_or_path tiiuae/falcon-7b \
    --bf16 True \
    --output_dir ./out_dir/ \
    --cache_dir ./hf-cache/ \
    --num_train_epochs 1 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --gradient_accumulation_steps 8 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 2000 \
    --save_total_limit 2 \
    --learning_rate 2e-5 \
    --weight_decay 0.1 \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --tf32 True \
    --max_steps 15000 \
    --model_max_length 2048 \
    --mem_freq 31 \
    --fsdp "full_shard auto_wrap" \
    --fsdp_transformer_layer_cls_to_wrap 'DecoderLayer'
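
# Note: with these flags the effective global batch size is
#   nproc_per_node (8) x per_device_train_batch_size (2) x gradient_accumulation_steps (8) = 128.
# If you run on a different number of GPUs, a common way to preserve the recipe is to
# rescale gradient_accumulation_steps so that this product stays the same
# (e.g. 4 GPUs -> gradient_accumulation_steps 16, assuming per-device batch size 2 still fits in memory).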