{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 100, "global_step": 195, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 2.5000000000000004e-07, "logits/chosen": -2.3828954696655273, "logits/rejected": -2.2103500366210938, "logps/chosen": -351.30865478515625, "logps/rejected": -310.087646484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "learning_rate": 2.5e-06, "logits/chosen": -2.15350341796875, "logits/rejected": -2.057192087173462, "logps/chosen": -291.661865234375, "logps/rejected": -299.000244140625, "loss": 0.6901, "rewards/accuracies": 0.5208333134651184, "rewards/chosen": -0.00531815318390727, "rewards/margins": 0.006059659644961357, "rewards/rejected": -0.01137781422585249, "step": 10 }, { "epoch": 0.1, "learning_rate": 5e-06, "logits/chosen": -1.7294094562530518, "logits/rejected": -1.6358362436294556, "logps/chosen": -349.6874084472656, "logps/rejected": -371.9268798828125, "loss": 0.6485, "rewards/accuracies": 0.609375, "rewards/chosen": -0.4694043695926666, "rewards/margins": 0.10101622343063354, "rewards/rejected": -0.5704206228256226, "step": 20 }, { "epoch": 0.15, "learning_rate": 4.959823971496575e-06, "logits/chosen": -1.3581154346466064, "logits/rejected": -1.2683781385421753, "logps/chosen": -328.9931945800781, "logps/rejected": -367.46417236328125, "loss": 0.6227, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.3038724958896637, "rewards/margins": 0.21112871170043945, "rewards/rejected": -0.5150011777877808, "step": 30 }, { "epoch": 0.2, "learning_rate": 4.8405871765993435e-06, "logits/chosen": -1.0669946670532227, "logits/rejected": -0.6835187673568726, "logps/chosen": -354.5089416503906, "logps/rejected": -383.76422119140625, "loss": 0.5903, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.4168413579463959, "rewards/margins": 0.37733370065689087, "rewards/rejected": -0.7941750884056091, "step": 40 }, { "epoch": 0.26, "learning_rate": 4.646121984004666e-06, "logits/chosen": -0.7538890838623047, "logits/rejected": -0.34662288427352905, "logps/chosen": -361.6393737792969, "logps/rejected": -380.0272521972656, "loss": 0.59, "rewards/accuracies": 0.703125, "rewards/chosen": -0.479973167181015, "rewards/margins": 0.41160327196121216, "rewards/rejected": -0.8915762901306152, "step": 50 }, { "epoch": 0.31, "learning_rate": 4.382678665009028e-06, "logits/chosen": -0.7476059198379517, "logits/rejected": -0.4476490020751953, "logps/chosen": -321.3587951660156, "logps/rejected": -371.570556640625, "loss": 0.5849, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3474940359592438, "rewards/margins": 0.410334974527359, "rewards/rejected": -0.7578290104866028, "step": 60 }, { "epoch": 0.36, "learning_rate": 4.058724504646834e-06, "logits/chosen": -0.5304551124572754, "logits/rejected": 0.0068548740819096565, "logps/chosen": -372.43316650390625, "logps/rejected": -403.68011474609375, "loss": 0.5931, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6598731875419617, "rewards/margins": 0.44455790519714355, "rewards/rejected": -1.10443115234375, "step": 70 }, { "epoch": 0.41, "learning_rate": 3.684671656182497e-06, "logits/chosen": -0.7597763538360596, "logits/rejected": -0.30586355924606323, "logps/chosen": -384.52679443359375, "logps/rejected": -407.8342590332031, "loss": 0.5901, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.5050551891326904, "rewards/margins": 0.4280626177787781, "rewards/rejected": -0.9331178665161133, "step": 80 }, { "epoch": 0.46, "learning_rate": 3.272542485937369e-06, "logits/chosen": -0.6929324865341187, "logits/rejected": -0.2577061057090759, "logps/chosen": -342.2850036621094, "logps/rejected": -383.3541259765625, "loss": 0.5777, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5730774998664856, "rewards/margins": 0.38491758704185486, "rewards/rejected": -0.9579952359199524, "step": 90 }, { "epoch": 0.51, "learning_rate": 2.835583164544139e-06, "logits/chosen": -0.8866588473320007, "logits/rejected": -0.39027491211891174, "logps/chosen": -326.8385925292969, "logps/rejected": -374.0113525390625, "loss": 0.5616, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.4010140001773834, "rewards/margins": 0.5182436108589172, "rewards/rejected": -0.9192575216293335, "step": 100 }, { "epoch": 0.51, "eval_logits/chosen": -0.6312460899353027, "eval_logits/rejected": -0.05466047301888466, "eval_logps/chosen": -351.8179016113281, "eval_logps/rejected": -391.2296447753906, "eval_loss": 0.5503215789794922, "eval_rewards/accuracies": 0.7139999866485596, "eval_rewards/chosen": -0.6370265483856201, "eval_rewards/margins": 0.559273362159729, "eval_rewards/rejected": -1.1962999105453491, "eval_runtime": 396.9424, "eval_samples_per_second": 5.039, "eval_steps_per_second": 0.63, "step": 100 }, { "epoch": 0.56, "learning_rate": 2.3878379241237136e-06, "logits/chosen": -0.5016804933547974, "logits/rejected": -0.17540986835956573, "logps/chosen": -381.13250732421875, "logps/rejected": -445.72344970703125, "loss": 0.568, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.8623906970024109, "rewards/margins": 0.4544965624809265, "rewards/rejected": -1.3168871402740479, "step": 110 }, { "epoch": 0.61, "learning_rate": 1.9436976651092143e-06, "logits/chosen": -0.5300595164299011, "logits/rejected": -0.16820363700389862, "logps/chosen": -343.6223449707031, "logps/rejected": -390.7423400878906, "loss": 0.5643, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5444029569625854, "rewards/margins": 0.469682514667511, "rewards/rejected": -1.0140855312347412, "step": 120 }, { "epoch": 0.67, "learning_rate": 1.5174374208651913e-06, "logits/chosen": -0.7855179309844971, "logits/rejected": -0.3058822751045227, "logps/chosen": -345.6529235839844, "logps/rejected": -394.87310791015625, "loss": 0.5708, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.4353795647621155, "rewards/margins": 0.5461079478263855, "rewards/rejected": -0.9814874529838562, "step": 130 }, { "epoch": 0.72, "learning_rate": 1.122757546369744e-06, "logits/chosen": -0.5981294512748718, "logits/rejected": 0.1230069175362587, "logps/chosen": -365.549560546875, "logps/rejected": -422.6253356933594, "loss": 0.553, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6209978461265564, "rewards/margins": 0.6843111515045166, "rewards/rejected": -1.3053090572357178, "step": 140 }, { "epoch": 0.77, "learning_rate": 7.723433775328385e-07, "logits/chosen": -0.31050771474838257, "logits/rejected": 0.17936445772647858, "logps/chosen": -382.371826171875, "logps/rejected": -423.1304626464844, "loss": 0.567, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7858445048332214, "rewards/margins": 0.49087247252464294, "rewards/rejected": -1.276716947555542, "step": 150 }, { "epoch": 0.82, "learning_rate": 4.774575140626317e-07, "logits/chosen": -0.3516121506690979, "logits/rejected": 0.05455173924565315, "logps/chosen": -383.17694091796875, "logps/rejected": -414.36322021484375, "loss": 0.5914, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.7312983870506287, "rewards/margins": 0.4293293058872223, "rewards/rejected": -1.160627841949463, "step": 160 }, { "epoch": 0.87, "learning_rate": 2.4757783024395244e-07, "logits/chosen": -0.3427812159061432, "logits/rejected": 0.1810428947210312, "logps/chosen": -328.9323425292969, "logps/rejected": -408.287353515625, "loss": 0.5436, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5952991247177124, "rewards/margins": 0.7224096059799194, "rewards/rejected": -1.3177087306976318, "step": 170 }, { "epoch": 0.92, "learning_rate": 9.00928482603669e-08, "logits/chosen": -0.4558378756046295, "logits/rejected": 0.023540988564491272, "logps/chosen": -353.60015869140625, "logps/rejected": -400.02459716796875, "loss": 0.5798, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6851487159729004, "rewards/margins": 0.5590900778770447, "rewards/rejected": -1.2442388534545898, "step": 180 }, { "epoch": 0.97, "learning_rate": 1.006426501190233e-08, "logits/chosen": -0.32187455892562866, "logits/rejected": 0.002347037196159363, "logps/chosen": -363.7867126464844, "logps/rejected": -415.94952392578125, "loss": 0.5752, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.6913945078849792, "rewards/margins": 0.4907089173793793, "rewards/rejected": -1.1821033954620361, "step": 190 }, { "epoch": 1.0, "step": 195, "total_flos": 0.0, "train_loss": 0.5863106256876236, "train_runtime": 9239.1596, "train_samples_per_second": 2.706, "train_steps_per_second": 0.021 } ], "logging_steps": 10, "max_steps": 195, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }