{ "best_metric": 0.4133135974407196, "best_model_checkpoint": "./Mistral/17-03-24-Weni-ZeroShot-3.4.22-Mistral-7b-DPO-1.0.0_ZeroShot DPO Training a improved dataset and with learning rate 40x smaller than SFT-2_max_steps-216_batch_64_2024-03-17_ppid_9/checkpoint-100", "epoch": 4.081632653061225, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.82, "grad_norm": 8.00854778289795, "learning_rate": 1.5454545454545454e-05, "logits/chosen": -1.1193724870681763, "logits/rejected": -1.0836293697357178, "logps/chosen": -20.637556076049805, "logps/rejected": -19.033395767211914, "loss": 0.5919, "rewards/accuracies": 0.555468738079071, "rewards/chosen": 0.23351702094078064, "rewards/margins": 0.5474305748939514, "rewards/rejected": -0.3139135241508484, "step": 20 }, { "epoch": 1.63, "grad_norm": 4.945928573608398, "learning_rate": 1.8453608247422682e-05, "logits/chosen": -1.113696813583374, "logits/rejected": -1.0786501169204712, "logps/chosen": -19.557655334472656, "logps/rejected": -20.298036575317383, "loss": 0.46, "rewards/accuracies": 0.88177090883255, "rewards/chosen": 0.7501090168952942, "rewards/margins": 1.7748991250991821, "rewards/rejected": -1.0247900485992432, "step": 40 }, { "epoch": 2.04, "eval_logits/chosen": -1.109602451324463, "eval_logits/rejected": -1.0745993852615356, "eval_logps/chosen": -20.414562225341797, "eval_logps/rejected": -21.255586624145508, "eval_loss": 0.42441505193710327, "eval_rewards/accuracies": 0.9139610528945923, "eval_rewards/chosen": 0.6362072825431824, "eval_rewards/margins": 1.8193808794021606, "eval_rewards/rejected": -1.183173656463623, "eval_runtime": 80.1894, "eval_samples_per_second": 2.17, "eval_steps_per_second": 0.137, "step": 50 }, { "epoch": 2.45, "grad_norm": 3.1385602951049805, "learning_rate": 1.6391752577319588e-05, "logits/chosen": -1.0951989889144897, "logits/rejected": -1.0641714334487915, "logps/chosen": -20.11690902709961, "logps/rejected": -20.665721893310547, "loss": 0.4096, "rewards/accuracies": 0.9296875, "rewards/chosen": 0.5607342720031738, "rewards/margins": 1.7147486209869385, "rewards/rejected": -1.1540143489837646, "step": 60 }, { "epoch": 3.27, "grad_norm": 3.271662712097168, "learning_rate": 1.4329896907216495e-05, "logits/chosen": -1.1112940311431885, "logits/rejected": -1.0746426582336426, "logps/chosen": -19.945621490478516, "logps/rejected": -20.860157012939453, "loss": 0.3922, "rewards/accuracies": 0.932812511920929, "rewards/chosen": 0.5313547849655151, "rewards/margins": 1.7884050607681274, "rewards/rejected": -1.2570502758026123, "step": 80 }, { "epoch": 4.08, "grad_norm": 3.477855920791626, "learning_rate": 1.2268041237113405e-05, "logits/chosen": -1.109692096710205, "logits/rejected": -1.0762851238250732, "logps/chosen": -19.662954330444336, "logps/rejected": -20.73871612548828, "loss": 0.3807, "rewards/accuracies": 0.9532985687255859, "rewards/chosen": 0.6525470614433289, "rewards/margins": 1.9103381633758545, "rewards/rejected": -1.2577911615371704, "step": 100 }, { "epoch": 4.08, "eval_logits/chosen": -1.1172126531600952, "eval_logits/rejected": -1.081807017326355, "eval_logps/chosen": -20.498506546020508, "eval_logps/rejected": -21.386648178100586, "eval_loss": 0.4133135974407196, "eval_rewards/accuracies": 0.9310064911842346, "eval_rewards/chosen": 0.5942327976226807, "eval_rewards/margins": 1.8429381847381592, "eval_rewards/rejected": -1.2487053871154785, "eval_runtime": 80.3111, "eval_samples_per_second": 2.167, "eval_steps_per_second": 0.137, "step": 100 } ], "logging_steps": 20, "max_steps": 216, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }