{ "accelerator_kwargs": {}, "adap_kl_ctrl": false, "backward_batch_size": 5, "batch_size": 60, "cliprange": 0.2, "cliprange_value": 0.2, "compare_steps": 1, "early_stopping": true, "exp_name": "morlhf", "forward_batch_size": null, "gamma": 1, "global_backward_batch_size": 20, "global_batch_size": 240, "gradient_accumulation_steps": 5, "horizon": 10000, "init_kl_coef": 0.1, "is_encoder_decoder": false, "is_peft_model": true, "kl_penalty": "kl", "lam": 0.95, "learning_rate": 1e-05, "log_with": "wandb", "max_grad_norm": 0.5, "mini_batch_size": 1, "model_name": "unsloth/Qwen2.5-7B", "optimize_cuda_cache": true, "optimize_device_cache": false, "ppo_epochs": 4, "project_kwargs": {}, "push_to_hub_if_best_kwargs": {}, "query_dataset": "imdb", "ratio_threshold": 10.0, "remove_unused_columns": true, "reward_model": "sentiment-analysis:lvwerra/distilbert-imdb", "score_clip": null, "seed": 0, "steps": 20000, "target": 3, "target_kl": 1, "task_name": null, "tracker_kwargs": { "wandb": { "name": "maxmin-dpo-init-kl-coef-0.1-fix-lora" } }, "tracker_project_name": "morlhf", "use_score_norm": false, "use_score_scaling": false, "vf_coef": 0.1, "whiten_rewards": false, "world_size": 4 }