|
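# Parse `key=value` overrides passed on the command line; each one is eval'd as
# a shell assignment. Illustrative invocation (script name and values are examples only):
#   bash train.sh type=gla path=exp/gla-test data=<hf_dataset> batch=32 nodes=1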
for arg in "$@"; do
    eval "$arg"
done
|
|
|
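# Report the effective settings; `${var:=default}` assigns the default whenever
# the variable is unset or empty.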
echo "model: ${model:=mistralai/Mistral-7B-v0.1}" |
|
echo "tokenizer: ${tokenizer:=mistralai/Mistral-7B-v0.1}" |
|
echo "project: ${project:=fla}" |
|
echo "type: ${type:=gla}" |
|
echo "data: ${data:=}" |
|
echo "name: ${name:=}" |
|
echo "cache: ${cache:=}" |
|
echo "seed: ${seed:=42}" |
|
echo "context: ${context:=2048}" |
|
echo "steps: ${steps:=0}" |
|
echo "save: ${save:=2048}" |
|
echo "limit: ${limit:=16}" |
|
echo "preprocessing: ${preprocessing:=32}" |
|
echo "workers: ${workers:=32}" |
|
echo "logging: ${logging:=32}" |
|
echo "config: ${config:=configs/deepspeed.yaml}" |
|
echo "push: ${push:=False}" |
|
|
|
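# Optimization hyperparameters.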
echo "lr: ${lr:=3e-4}" |
|
echo "scheduler: ${scheduler:=cosine_with_min_lr}" |
|
echo "epochs: ${epochs:=1}" |
|
echo "optim: ${optim:=adamw_torch_fused}" |
|
echo "decay: ${decay:=0.01}" |
|
echo "beta1: ${beta1:=0.9}" |
|
echo "beta2: ${beta2:=0.95}" |
|
echo "norm: ${norm:=1.0}" |
|
echo "batch: ${batch:=32}" |
|
echo "update: ${update:=4}" |
|
echo "warmup: ${warmup:=512}" |
|
echo "path: ${path:=}" |
|
echo "checkpoint: ${checkpoint:=}" |
|
echo "node: ${node:=}" |
|
echo "rank: ${rank:=}" |
|
echo "ip: ${ip:=}" |
|
echo "port: ${port:=}" |
|
echo "nodes: ${nodes:=1}" |
|
|
|
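# Assemble the argument list forwarded to run.py.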
params="--model_name_or_path $model \ |
|
--tokenizer $tokenizer \ |
|
--use_fast_tokenizer \ |
|
--do_train \ |
|
--dataset $data \ |
|
--context_length $context \ |
|
--streaming \ |
|
--preprocessing_num_workers $preprocessing \ |
|
--dataloader_num_workers $workers \ |
|
--dataloader_prefetch_factor 2 \ |
|
--ignore_data_skip \ |
|
--output_dir $path \ |
|
--overwrite_output_dir \ |
|
--logging_steps $logging \ |
|
--include_num_input_tokens_seen \ |
|
--save_steps $save \ |
|
--save_total_limit $limit \ |
|
--learning_rate $lr \ |
|
--lr_scheduler_type $scheduler \ |
|
--warmup_steps $warmup \ |
|
--optim $optim \ |
|
--weight_decay $decay \ |
|
--adam_beta1=$beta1 \ |
|
--adam_beta2=$beta2 \ |
|
--max_grad_norm $norm \ |
|
--num_train_epochs $epochs \ |
|
--per_device_train_batch_size $batch \ |
|
--gradient_accumulation_steps $update \ |
|
--seed $seed \ |
|
--logging_steps $logging \ |
|
--push_to_hub $push \ |
|
--bf16" |
|
|
|
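# Optional flags, appended only when the corresponding variable is non-empty.
# Metrics go to wandb unless WANDB_DISABLED=true.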
if [ $steps -gt 0 ]; then
    params+=" --max_steps $steps"
fi

if [ "$name" != "" ]; then
    params+=" --dataset_name $name"
fi
if [ "$cache" != "" ]; then
    params+=" --cache_dir $cache"
fi
if [ "$checkpoint" != "" ]; then
    params+=" --resume_from_checkpoint $checkpoint"
fi
if [ "$WANDB_DISABLED" != "true" ]; then
    params+=" --report_to wandb \
    --run_name $type.$(basename $path)"
else
    params+=" --report_to none"
fi
|
|
|
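# Count the local GPUs; if a machine rank was given, add multi-node arguments
# for `accelerate launch`.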
NUM_GPUS=$(nvidia-smi --list-gpus | wc -l)
echo "Launching training..."
accelerate_params=""
if [ "$rank" != "" ]; then
    accelerate_params+=" --machine_rank $rank \
    --num_processes $((nodes * $NUM_GPUS)) \
    --num_machines $nodes \
    --main_process_ip $ip \
    --main_process_port $port \
    --same_network"
fi
|
|
|
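# When the config path mentions "deepspeed", regenerate a ZeRO stage-2 DeepSpeed
# config and a matching single-node accelerate config.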
if [[ $config == *"deepspeed"* ]]; then |
|
cat <<EOF > "configs/ds_config.json" |
|
{ |
|
"train_batch_size": "auto", |
|
"train_micro_batch_size_per_gpu": "auto", |
|
"gradient_accumulation_steps": "auto", |
|
"gradient_clipping": "auto", |
|
"zero_allow_untested_optimizer": true, |
|
"bf16": { |
|
"enabled": true |
|
}, |
|
"zero_optimization": { |
|
"stage": 2, |
|
"allgather_partitions": true, |
|
"allgather_bucket_size": 5e8, |
|
"reduce_scatter": true, |
|
"reduce_bucket_size": 5e8, |
|
"overlap_comm": false, |
|
"contiguous_gradients": true |
|
} |
|
} |
|
EOF |
|
cat <<EOF > $config |
|
compute_environment: LOCAL_MACHINE |
|
distributed_type: DEEPSPEED |
|
deepspeed_config: |
|
deepspeed_config_file: configs/ds_config.json |
|
zero3_init_flag: true |
|
machine_rank: 0 |
|
main_training_function: main |
|
num_machines: 1 |
|
num_processes: $NUM_GPUS |
|
use_cpu: false |
|
EOF |
|
fi |
|
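# When the config path mentions "fsdp", regenerate an FSDP accelerate config instead.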
if [[ $config == *"fsdp"* ]]; then |
|
cat <<EOF > $config |
|
compute_environment: LOCAL_MACHINE |
|
distributed_type: FSDP |
|
fsdp_config: |
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP |
|
fsdp_backward_prefetch: BACKWARD_PRE |
|
fsdp_forward_prefetch: false |
|
fsdp_cpu_ram_efficient_loading: true |
|
fsdp_offload_params: false |
|
fsdp_sharding_strategy: HYBRID_SHARD_ZERO2 |
|
fsdp_state_dict_type: SHARDED_STATE_DICT |
|
fsdp_sync_module_states: true |
|
fsdp_use_orig_params: true |
|
machine_rank: 0 |
|
main_training_function: main |
|
mixed_precision: bf16 |
|
num_machines: $nodes |
|
num_processes: $((nodes * $NUM_GPUS)) |
|
rdzv_backend: static |
|
same_network: true |
|
tpu_env: [] |
|
tpu_use_cluster: false |
|
tpu_use_sudo: false |
|
use_cpu: false |
|
EOF |
|
fi |
|
|
|
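# Show the accelerate config that will be used.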
cat $config |
|
|
|
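# Trace subsequent commands and snapshot the code, configs, and the local fla
# package into the output directory for reproducibility.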
set -x
mkdir -p $path
cp * $path
cp -r configs $path
cp -r flame $path
cp -r ../fla $path
|
|
|
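# Use local HF caches only (offline mode), name the W&B run after the model type
# and output directory so interrupted runs resume under the same run ID, then
# launch training via accelerate.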
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
if [ "$date" == "" ]; then
    date=$(date +%Y%m%d%H%M)
fi
export WANDB_RESUME=allow
export WANDB_NAME="$type.$(basename $path)"
export WANDB_PROJECT=$project
export WANDB_RUN_ID="$WANDB_NAME-$date"
accelerate launch $accelerate_params --config_file $config run.py $params
|
|
|
echo "RUNNING DONE!" |
|
|