model_base_repository_id: "meta-llama/Meta-Llama-3-70B-Instruct"
hub_model_id: "Weni/WeniGPT-Agents-Llama3-5.1.24-SFT"
project: 'WeniGPT' # zeroshot or wenigpt
dataset_id: "Weni/wenigpt-agent-sft-1.0.1"
folder_name: "llama3"
model_arch: "llama3"
description: 'Experiment with DPO and Llama3 70b'
task: 'SFT'
quantization_type: 'bitsandbytes'
metric_type: "text_generation"
card_format: "simple_card"
hub_reference_model_dpo_id: ""
use_sloth: False
use_fsdp: False

# Dataset
dataset_text_field: "prompt"
language: ["pt"]
prompt_format_file: "prompt_templates/agent_prompt_chat_sft_v3.yaml"
chat_format: False
chat_template: "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"

# HuggingFace
hub_strategy: 'all_checkpoints'

# Wandb
wandb_token: ${WANDB_TOKEN}
wandb_run_name: "Llama3-5.1.24-SFT"
wandb_project_name: "WeniGPT"
group_name: "Sprint 42"
wandb_notes: "Training 70b with the same params as 70b"

# Lora
use_lora: True
bits: 4
use_exllama: True
lora_r: 256
lora_alpha: 128
lora_dropout: 0.05
bias: "none"
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
task_type: "CAUSAL_LM"
use_rslora: False

# Bits and bytes
load_in_4bit: True
use_4bit: True
bnb_4bit_use_double_quant: True
bnb_4bit_quant_type: "nf4"
bnb_4bit_compute_dtype: torch.bfloat16

# Training Args
max_seq_length: 8192
num_train_epochs: 4
per_device_train_batch_size: 1
per_device_eval_batch_size: 1
gradient_accumulation_steps: 8
gradient_checkpointing: True
optimizer: "AdamW"
learning_rate: 2e-4
save_steps: 2
eval_steps: 2
logging_steps: 10
max_steps: 0
fp16: False
bf16: True
tf32: False
packing: False
lr_scheduler_type: "cosine"
pretraining_tp: 1
mlm: False
save_strategy: "steps"
eval_strategy: "steps"
load_best_model_at_end: True
metric_for_best_model: 'eval_loss'
greater_is_better: False
prediction_loss_only: False
save_safetensors: True
max_grad_norm: 0.3
warmup_ratio: 0.03
weight_decay: 0.01
neftune_noise_alpha: 5
torch_dtype: torch.bfloat16
save_total_limit: 5

# Tokenizer
padding: True
padding_side: 'right'
add_bos_token: False
add_eos_token: True
trust_remote_code: True
use_auth_token: True
eos_token: "<|end_of_text|>"
pad_token: "<|end_of_text|>"
stop_tokens: ["<|end_of_text|>", "<|eot_id|>"]

# DPO
dpo_beta: 0.1
dpo_max_length: 8192
dpo_max_target_length: 8192
dpo_max_prompt_length: 8192
dpo_loss_type: "sigmoid"
dpo_label_smoothing: 0

# KTO
kto_beta: 0.1
kto_desirable_weight: 1.0
kto_undesirable_weight: 1.0
kto_max_length: 1024
kto_max_completion_length: 1024
kto_max_prompt_length: 1024

# ORPO
orpo_beta: 0.1
orpo_max_length: 8192
orpo_max_prompt_length: 8192

# merged models
merged_model_id: "Weni/WeniGPT-Agents-Llama3-5.1.24-SFT-merged"
low_cpu_mem_usage: True

# awq
quantization_awq:
  awq_destiny_model_id: "Weni/WeniGPT-Agents-Llama3-5.1.24-SFT-AWQ"
  safetensors: True
  config:
    zero_point: True
    q_group_size: 128
    w_bit: 4
    version: "GEMM"

# Misc
disable_tqdm: False
include_inputs_for_metrics: False

# Config PR
pull_request:
  update_type: training # ['training', 'code_update', 'bug']
  delete_branch_after_push: true

# Config Runpod
config:
  VOLUMEINGB: 200
  CONTAINERDISKINGB: 1000
  NAME_POD: "Llama3 Agents WeniGPT 5.1.24-SFT"
  PORTS: "8888/http"
  VOLUMEMOUNTPATH: "/workspace"
  MIN_GPU_COUNT: 8
  MAX_GPU_COUNT: 8
  DOCKER_ARGS: ""
  IDS: "NVIDIA H100 80GB HBM3;NVIDIA H100 PCIe;"

accelerate:
  deepspeed_config:
    offload_optimizer_device: "cpu"
    offload_param_device: "cpu"
    zero3_init_flag: false
    zero3_save_16bit_model: true
    stage3_gather_16bit_weights_on_model_save: true
    zero_stage: 3
  distributed_type: "deepspeed"
  use_cpu: false
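
# Note (illustrative only, kept as comments so the config stays valid YAML): a minimal
# sketch of how the quantization and LoRA keys above would typically be consumed on the
# Python side, assuming the standard transformers/peft APIs. The key-to-argument mapping
# is an assumption for clarity, not taken from this repository's training code.
#
#   import torch
#   from transformers import BitsAndBytesConfig
#   from peft import LoraConfig
#
#   # 4-bit NF4 quantization with double quantization and bf16 compute (see "# Bits and bytes")
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_use_double_quant=True,
#       bnb_4bit_quant_type="nf4",
#       bnb_4bit_compute_dtype=torch.bfloat16,
#   )
#
#   # LoRA adapter settings (see "# Lora")
#   lora_config = LoraConfig(
#       r=256,
#       lora_alpha=128,
#       lora_dropout=0.05,
#       bias="none",
#       target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
#       task_type="CAUSAL_LM",
#   )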