add_bos: false
checkpointing_steps: null
clip_grad_norm: -1
config_name: null
dataset_config_name: null
dataset_name: null
do_eval: true
eval_batch_size: 2
eval_file: self-seq/data/lima500_withsys.jsonl
eval_steps: '100'
gradient_accumulation_steps: 16
gradient_checkpointing: true
learning_rate: 2.0e-05
logging_steps: 5
lora_alpha: 16
lora_dropout: 0.1
lora_rank: 64
low_cpu_mem_usage: false
lr_scheduler_type: linear
max_seq_length: 2048
max_train_steps: 2316
model_name_or_path: /nas/shared/NLP_A100/model/Meta-Llama-3-8B
model_revision: main
num_train_epochs: 3
output_dir: output/self-seq-Meta-Llama-3-8B-tulu100k_base_ours_new_llama70b/
overwrite_cache: false
per_device_train_batch_size: 2
preprocessing_num_workers: 64
prompt_template: tulu
reduce_loss: mean
report_to: all
resume_from_checkpoint: null
sample_train: -1
seed: 42
timeout: 1800
tokenizer_name: /nas/shared/NLP_A100/model/Meta-Llama-3-8B
tokenizer_revision: null
train_file: self-seq/data/tuluv2/tuluv2_100k_base_ours_new.jsonl
trust_remote_code: false
use_8bit_optimizer: false
use_flash_attn: true
use_lora: false
use_qlora: false
use_slow_tokenizer: true
warmup_ratio: 0.03
weight_decay: 0.0
with_tracking: true
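
A minimal sketch of how this config could be loaded and sanity-checked before launching a run. The file name `train_config.yaml` and the world size of 4 GPUs are assumptions (the GPU count is not recorded in the config itself); the arithmetic uses only values present above.

```python
# Sketch only: load the training config and derive a few implied quantities.
import math
import yaml  # pip install pyyaml

with open("train_config.yaml") as f:  # assumed file name for the YAML above
    cfg = yaml.safe_load(f)

world_size = 4  # assumption: number of GPUs is not stored in the config

# Effective global batch size = per-device batch * grad accumulation * GPUs.
effective_batch = (
    cfg["per_device_train_batch_size"]
    * cfg["gradient_accumulation_steps"]
    * world_size
)
print(f"effective global batch size: {effective_batch}")  # 2 * 16 * 4 = 128

# Optimizer steps per epoch implied by max_train_steps over num_train_epochs.
steps_per_epoch = cfg["max_train_steps"] / cfg["num_train_epochs"]
print(f"steps per epoch: {steps_per_epoch:.0f}")  # 2316 / 3 = 772

# Warmup steps under the linear scheduler with the given warmup_ratio.
warmup_steps = math.ceil(cfg["warmup_ratio"] * cfg["max_train_steps"])
print(f"warmup steps: {warmup_steps}")  # ceil(0.03 * 2316) = 70
```

Note that `use_lora`, `use_qlora`, and `use_8bit_optimizer` are all false here, so the `lora_alpha`, `lora_dropout`, and `lora_rank` entries are inert defaults: this run is full-parameter finetuning with gradient checkpointing and flash attention enabled.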