{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 100, "global_step": 195, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 2.5000000000000004e-07, "logits/chosen": -2.5323238372802734, "logits/rejected": -2.550581216812134, "logps/chosen": -251.1321258544922, "logps/rejected": -304.1657409667969, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05, "learning_rate": 2.5e-06, "logits/chosen": -2.6382791996002197, "logits/rejected": -2.5627737045288086, "logps/chosen": -306.50714111328125, "logps/rejected": -308.0683898925781, "loss": 0.6928, "rewards/accuracies": 0.4618055522441864, "rewards/chosen": -0.00980505533516407, "rewards/margins": 0.002812173217535019, "rewards/rejected": -0.012617227621376514, "step": 10 }, { "epoch": 0.1, "learning_rate": 5e-06, "logits/chosen": -2.5880370140075684, "logits/rejected": -2.574676275253296, "logps/chosen": -298.3855285644531, "logps/rejected": -308.91644287109375, "loss": 0.6875, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.03349592164158821, "rewards/margins": 0.012716387398540974, "rewards/rejected": -0.04621230810880661, "step": 20 }, { "epoch": 0.15, "learning_rate": 4.959823971496575e-06, "logits/chosen": -2.5550596714019775, "logits/rejected": -2.451047420501709, "logps/chosen": -319.48651123046875, "logps/rejected": -308.0125732421875, "loss": 0.6752, "rewards/accuracies": 0.625, "rewards/chosen": -0.06745009124279022, "rewards/margins": 0.05002344399690628, "rewards/rejected": -0.1174735426902771, "step": 30 }, { "epoch": 0.2, "learning_rate": 4.8405871765993435e-06, "logits/chosen": -2.552633285522461, "logits/rejected": -2.4666683673858643, "logps/chosen": -318.9139099121094, "logps/rejected": -328.02813720703125, "loss": 0.6639, "rewards/accuracies": 0.578125, "rewards/chosen": -0.11392641067504883, "rewards/margins": 0.05299054831266403, "rewards/rejected": -0.16691696643829346, "step": 40 }, { "epoch": 0.26, "learning_rate": 4.646121984004666e-06, "logits/chosen": -2.5199167728424072, "logits/rejected": -2.494981527328491, "logps/chosen": -313.51763916015625, "logps/rejected": -337.84942626953125, "loss": 0.6479, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15254025161266327, "rewards/margins": 0.10167612135410309, "rewards/rejected": -0.25421637296676636, "step": 50 }, { "epoch": 0.31, "learning_rate": 4.382678665009028e-06, "logits/chosen": -2.4956717491149902, "logits/rejected": -2.420666456222534, "logps/chosen": -326.60302734375, "logps/rejected": -344.81622314453125, "loss": 0.6472, "rewards/accuracies": 0.625, "rewards/chosen": -0.2229386270046234, "rewards/margins": 0.10586366802453995, "rewards/rejected": -0.32880228757858276, "step": 60 }, { "epoch": 0.36, "learning_rate": 4.058724504646834e-06, "logits/chosen": -2.4367499351501465, "logits/rejected": -2.3880956172943115, "logps/chosen": -304.6087951660156, "logps/rejected": -337.37860107421875, "loss": 0.6375, "rewards/accuracies": 0.640625, "rewards/chosen": -0.25156134366989136, "rewards/margins": 0.13926830887794495, "rewards/rejected": -0.3908296823501587, "step": 70 }, { "epoch": 0.41, "learning_rate": 3.684671656182497e-06, "logits/chosen": -2.4801056385040283, "logits/rejected": -2.366367816925049, "logps/chosen": -307.1095275878906, "logps/rejected": -320.8377990722656, "loss": 0.6347, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.2161990851163864, "rewards/margins": 0.17663030326366425, "rewards/rejected": -0.39282941818237305, "step": 80 }, { "epoch": 0.46, "learning_rate": 3.272542485937369e-06, "logits/chosen": -2.3947010040283203, "logits/rejected": -2.342723846435547, "logps/chosen": -295.4327392578125, "logps/rejected": -327.5935363769531, "loss": 0.629, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2965458929538727, "rewards/margins": 0.1947285681962967, "rewards/rejected": -0.4912744462490082, "step": 90 }, { "epoch": 0.51, "learning_rate": 2.835583164544139e-06, "logits/chosen": -2.314499855041504, "logits/rejected": -2.2232449054718018, "logps/chosen": -327.8091735839844, "logps/rejected": -362.3641052246094, "loss": 0.6232, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2903655767440796, "rewards/margins": 0.23061306774616241, "rewards/rejected": -0.5209786295890808, "step": 100 }, { "epoch": 0.51, "eval_logits/chosen": -2.2760729789733887, "eval_logits/rejected": -2.1507985591888428, "eval_logps/chosen": -339.0613708496094, "eval_logps/rejected": -350.98443603515625, "eval_loss": 0.6181190609931946, "eval_rewards/accuracies": 0.6679999828338623, "eval_rewards/chosen": -0.4065861999988556, "eval_rewards/margins": 0.20490887761116028, "eval_rewards/rejected": -0.6114951372146606, "eval_runtime": 384.0631, "eval_samples_per_second": 5.207, "eval_steps_per_second": 0.651, "step": 100 }, { "epoch": 0.56, "learning_rate": 2.3878379241237136e-06, "logits/chosen": -2.16359281539917, "logits/rejected": -2.0681121349334717, "logps/chosen": -342.3879089355469, "logps/rejected": -355.98919677734375, "loss": 0.6164, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4416044354438782, "rewards/margins": 0.23517628014087677, "rewards/rejected": -0.6767807602882385, "step": 110 }, { "epoch": 0.61, "learning_rate": 1.9436976651092143e-06, "logits/chosen": -2.197049856185913, "logits/rejected": -2.077195644378662, "logps/chosen": -353.0827941894531, "logps/rejected": -376.5859375, "loss": 0.6133, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4651293158531189, "rewards/margins": 0.2698659300804138, "rewards/rejected": -0.7349953651428223, "step": 120 }, { "epoch": 0.67, "learning_rate": 1.5174374208651913e-06, "logits/chosen": -2.047089099884033, "logits/rejected": -1.901155710220337, "logps/chosen": -341.2831115722656, "logps/rejected": -376.09326171875, "loss": 0.5841, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4085904657840729, "rewards/margins": 0.3427460491657257, "rewards/rejected": -0.7513364553451538, "step": 130 }, { "epoch": 0.72, "learning_rate": 1.122757546369744e-06, "logits/chosen": -1.988149642944336, "logits/rejected": -1.7608541250228882, "logps/chosen": -388.6386413574219, "logps/rejected": -387.81829833984375, "loss": 0.5888, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5027592182159424, "rewards/margins": 0.26467037200927734, "rewards/rejected": -0.7674296498298645, "step": 140 }, { "epoch": 0.77, "learning_rate": 7.723433775328385e-07, "logits/chosen": -1.8329941034317017, "logits/rejected": -1.6359748840332031, "logps/chosen": -354.4386291503906, "logps/rejected": -402.38970947265625, "loss": 0.5783, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.4927349090576172, "rewards/margins": 0.3869919180870056, "rewards/rejected": -0.879726767539978, "step": 150 }, { "epoch": 0.82, "learning_rate": 4.774575140626317e-07, "logits/chosen": -1.7938659191131592, "logits/rejected": -1.6463531255722046, "logps/chosen": -351.1708068847656, "logps/rejected": -407.2122497558594, "loss": 0.5802, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.5660358667373657, "rewards/margins": 0.35360515117645264, "rewards/rejected": -0.9196408987045288, "step": 160 }, { "epoch": 0.87, "learning_rate": 2.4757783024395244e-07, "logits/chosen": -1.7095705270767212, "logits/rejected": -1.6499723196029663, "logps/chosen": -335.1717224121094, "logps/rejected": -388.1880798339844, "loss": 0.593, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5228649377822876, "rewards/margins": 0.3808245062828064, "rewards/rejected": -0.9036895036697388, "step": 170 }, { "epoch": 0.92, "learning_rate": 9.00928482603669e-08, "logits/chosen": -1.8501991033554077, "logits/rejected": -1.6261276006698608, "logps/chosen": -354.8654479980469, "logps/rejected": -375.2087097167969, "loss": 0.5925, "rewards/accuracies": 0.6875, "rewards/chosen": -0.491665780544281, "rewards/margins": 0.341984361410141, "rewards/rejected": -0.8336501121520996, "step": 180 }, { "epoch": 0.97, "learning_rate": 1.006426501190233e-08, "logits/chosen": -1.7825686931610107, "logits/rejected": -1.5483803749084473, "logps/chosen": -352.39453125, "logps/rejected": -381.07086181640625, "loss": 0.5833, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.5239015817642212, "rewards/margins": 0.3186902701854706, "rewards/rejected": -0.8425917625427246, "step": 190 }, { "epoch": 1.0, "step": 195, "total_flos": 0.0, "train_loss": 0.6237345188091963, "train_runtime": 8932.105, "train_samples_per_second": 2.799, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 195, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }